Repository: rhasspy/rhasspy3 Branch: master Commit: 11e8d3016d32 Files: 258 Total size: 454.7 KB Directory structure: gitextract_7danshym/ ├── .gitignore ├── .gitmodules ├── .isort.cfg ├── LICENSE.md ├── README.md ├── bin/ │ ├── asr_adapter_raw2text.py │ ├── asr_adapter_wav2text.py │ ├── asr_transcribe.py │ ├── asr_transcribe_stream.py │ ├── asr_transcribe_wav.py │ ├── client_unix_socket.py │ ├── config_print.py │ ├── handle_adapter_json.py │ ├── handle_adapter_text.py │ ├── handle_intent.py │ ├── handle_text.py │ ├── intent_recognize.py │ ├── mic_adapter_raw.py │ ├── mic_record_sample.py │ ├── mic_test_energy.py │ ├── pipeline_run.py │ ├── program_download.py │ ├── program_install.py │ ├── satellite_run.py │ ├── server_run.py │ ├── snd_adapter_raw.py │ ├── snd_play.py │ ├── tts_adapter_http.py │ ├── tts_adapter_text2wav.py │ ├── tts_speak.py │ ├── tts_synthesize.py │ ├── vad_adapter_raw.py │ ├── vad_segment_wav.py │ ├── wake_adapter_raw.py │ └── wake_detect.py ├── docs/ │ ├── README.md │ ├── adapters.md │ ├── domains.md │ ├── home_assistant.md │ ├── satellite.md │ ├── tutorial.md │ └── wyoming.md ├── examples/ │ └── satellite/ │ └── configuration.yaml ├── mypy.ini ├── programs/ │ ├── asr/ │ │ ├── coqui-stt/ │ │ │ ├── README.md │ │ │ ├── bin/ │ │ │ │ ├── coqui_stt_raw2text.py │ │ │ │ ├── coqui_stt_server.py │ │ │ │ └── coqui_stt_wav2text.py │ │ │ ├── requirements.txt │ │ │ └── script/ │ │ │ ├── download.py │ │ │ ├── raw2text │ │ │ ├── server │ │ │ ├── setup │ │ │ └── wav2text │ │ ├── faster-whisper/ │ │ │ ├── README.md │ │ │ ├── bin/ │ │ │ │ ├── faster_whisper_server.py │ │ │ │ └── faster_whisper_wav2text.py │ │ │ ├── script/ │ │ │ │ ├── download.py │ │ │ │ ├── server │ │ │ │ ├── setup │ │ │ │ └── wav2text │ │ │ └── src/ │ │ │ ├── LICENSE │ │ │ ├── README.md │ │ │ ├── faster_whisper/ │ │ │ │ ├── __init__.py │ │ │ │ ├── audio.py │ │ │ │ ├── feature_extractor.py │ │ │ │ └── transcribe.py │ │ │ ├── requirements.conversion.txt │ │ │ ├── requirements.txt │ │ │ └── 
setup.py │ │ ├── pocketsphinx/ │ │ │ ├── README.md │ │ │ ├── bin/ │ │ │ │ ├── pocketsphinx_raw2text.py │ │ │ │ ├── pocketsphinx_server.py │ │ │ │ └── pocketsphinx_wav2text.py │ │ │ ├── requirements.txt │ │ │ └── script/ │ │ │ ├── download.py │ │ │ ├── raw2text │ │ │ ├── server │ │ │ ├── setup │ │ │ └── wav2text │ │ ├── vosk/ │ │ │ ├── README.md │ │ │ ├── bin/ │ │ │ │ ├── vosk_raw2text.py │ │ │ │ ├── vosk_server.py │ │ │ │ └── vosk_wav2text.py │ │ │ ├── requirements.txt │ │ │ └── script/ │ │ │ ├── download.py │ │ │ ├── raw2text │ │ │ ├── server │ │ │ ├── setup │ │ │ └── wav2text │ │ ├── whisper/ │ │ │ ├── README.md │ │ │ ├── bin/ │ │ │ │ ├── whisper_server.py │ │ │ │ └── whisper_wav2text.py │ │ │ ├── requirements.txt │ │ │ └── script/ │ │ │ ├── server │ │ │ ├── setup │ │ │ └── wav2text │ │ └── whisper-cpp/ │ │ ├── .gitignore │ │ ├── Dockerfile.libwhisper │ │ ├── Dockerfile.libwhisper.dockerignore │ │ ├── README.md │ │ ├── bin/ │ │ │ ├── whisper_cpp_server.py │ │ │ └── whisper_cpp_wav2text.py │ │ ├── lib/ │ │ │ ├── Makefile │ │ │ └── whisper_cpp.py │ │ ├── requirements.txt │ │ └── script/ │ │ ├── build_libwhisper │ │ ├── download.py │ │ ├── server │ │ ├── setup │ │ ├── setup.py │ │ └── wav2text │ ├── handle/ │ │ ├── date_time/ │ │ │ └── bin/ │ │ │ └── date_time.py │ │ └── home_assistant/ │ │ └── bin/ │ │ └── converse.py │ ├── intent/ │ │ └── regex/ │ │ └── bin/ │ │ └── regex.py │ ├── mic/ │ │ ├── pyaudio/ │ │ │ ├── README.md │ │ │ ├── bin/ │ │ │ │ ├── pyaudio_events.py │ │ │ │ ├── pyaudio_list_mics.py │ │ │ │ ├── pyaudio_raw.py │ │ │ │ └── pyaudio_shared.py │ │ │ ├── requirements.txt │ │ │ └── script/ │ │ │ ├── events │ │ │ ├── list_mics │ │ │ ├── raw │ │ │ └── setup │ │ ├── sounddevice/ │ │ │ ├── README.md │ │ │ ├── bin/ │ │ │ │ ├── sounddevice_events.py │ │ │ │ ├── sounddevice_list_mics.py │ │ │ │ ├── sounddevice_raw.py │ │ │ │ └── sounddevice_shared.py │ │ │ ├── requirements.txt │ │ │ └── script/ │ │ │ ├── events │ │ │ ├── list_mics │ │ │ ├── raw │ │ │ └── setup │ 
│ └── udp_raw/ │ │ └── bin/ │ │ └── udp_raw.py │ ├── remote/ │ │ └── websocket/ │ │ ├── bin/ │ │ │ └── stream2stream.py │ │ ├── requirements.txt │ │ └── script/ │ │ ├── run │ │ └── setup │ ├── snd/ │ │ └── udp_raw/ │ │ └── bin/ │ │ └── udp_raw.py │ ├── tts/ │ │ ├── coqui-tts/ │ │ │ ├── README.md │ │ │ ├── requirements.txt │ │ │ └── script/ │ │ │ ├── list_models │ │ │ ├── server │ │ │ └── setup │ │ ├── flite/ │ │ │ └── script/ │ │ │ ├── download.py │ │ │ └── setup │ │ ├── larynx/ │ │ │ ├── README.md │ │ │ ├── bin/ │ │ │ │ └── larynx_client.py │ │ │ ├── requirements.txt │ │ │ └── script/ │ │ │ ├── list_models │ │ │ ├── server │ │ │ └── setup │ │ ├── marytts/ │ │ │ └── bin/ │ │ │ └── marytts.py │ │ ├── mimic3/ │ │ │ ├── README.md │ │ │ ├── bin/ │ │ │ │ └── mimic3_server.py │ │ │ ├── requirements.txt │ │ │ └── script/ │ │ │ ├── server │ │ │ └── setup │ │ └── piper/ │ │ ├── README.md │ │ ├── bin/ │ │ │ └── piper_server.py │ │ └── script/ │ │ ├── download.py │ │ ├── server │ │ └── setup.py │ ├── vad/ │ │ ├── energy/ │ │ │ └── bin/ │ │ │ └── energy_speech_prob.py │ │ ├── silero/ │ │ │ ├── README.md │ │ │ ├── bin/ │ │ │ │ └── silero_speech_prob.py │ │ │ ├── requirements.txt │ │ │ ├── script/ │ │ │ │ ├── setup │ │ │ │ └── speech_prob │ │ │ └── share/ │ │ │ └── silero_vad.onnx │ │ └── webrtcvad/ │ │ ├── README.md │ │ ├── bin/ │ │ │ └── webrtcvad_speech_prob.py │ │ ├── requirements.txt │ │ └── script/ │ │ ├── setup │ │ └── speech_prob │ └── wake/ │ ├── porcupine1/ │ │ ├── bin/ │ │ │ ├── list_models.py │ │ │ ├── porcupine_raw_text.py │ │ │ ├── porcupine_shared.py │ │ │ └── porcupine_stream.py │ │ ├── requirements.txt │ │ └── script/ │ │ ├── download.py │ │ ├── list_models │ │ ├── raw2text │ │ └── setup │ ├── precise-lite/ │ │ ├── bin/ │ │ │ └── precise.py │ │ ├── requirements.txt │ │ ├── script/ │ │ │ └── setup │ │ └── share/ │ │ └── hey_mycroft.tflite │ └── snowboy/ │ ├── bin/ │ │ └── snowboy_raw_text.py │ ├── requirements.txt │ ├── script/ │ │ └── setup │ └── share/ │ ├── 
hey_extreme.umdl │ ├── jarvis.umdl │ ├── neoya.umdl │ ├── smart_mirror.umdl │ ├── snowboy.umdl │ ├── subex.umdl │ └── view_glass.umdl ├── pylintrc ├── requirements_dev.txt ├── requirements_http_api.txt ├── rhasspy3/ │ ├── VERSION │ ├── __init__.py │ ├── asr.py │ ├── audio.py │ ├── config.py │ ├── configuration.yaml │ ├── core.py │ ├── event.py │ ├── handle.py │ ├── intent.py │ ├── mic.py │ ├── pipeline.py │ ├── program.py │ ├── py.typed │ ├── remote.py │ ├── snd.py │ ├── tts.py │ ├── util/ │ │ ├── __init__.py │ │ ├── dataclasses_json.py │ │ └── jaml.py │ ├── vad.py │ └── wake.py ├── rhasspy3_http_api/ │ ├── __init__.py │ ├── __main__.py │ ├── asr.py │ ├── css/ │ │ └── main.css │ ├── handle.py │ ├── intent.py │ ├── js/ │ │ ├── main.js │ │ └── recorder.worklet.js │ ├── pipeline.py │ ├── snd.py │ ├── templates/ │ │ ├── asr.html │ │ ├── index.html │ │ ├── layout.html │ │ ├── pipeline.html │ │ ├── satellite.html │ │ └── tts.html │ ├── tts.py │ └── wake.py ├── script/ │ ├── format │ ├── http_server │ ├── lint │ ├── run │ ├── setup │ ├── setup_http_server │ └── test ├── setup.cfg ├── setup.py ├── tests/ │ ├── test_dataclasses_json.py │ └── test_jaml.py └── tools/ └── websocket-client/ ├── bin/ │ └── websocket_client.py ├── requirements.txt └── script/ ├── run └── setup ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ .DS_Store .idea *.log tmp/ *.py[cod] *.egg /build htmlcov .projectile .venv/ venv/ .mypy_cache/ *.egg-info/ /local/ ================================================ FILE: .gitmodules ================================================ [submodule "programs/asr/whisper.cpp/build/whisper.cpp"] path = programs/asr/whisper.cpp/build/whisper.cpp url = https://github.com/ggerganov/whisper.cpp ================================================ FILE: .isort.cfg 
================================================ [settings] multi_line_output=3 include_trailing_comma=True force_grid_wrap=0 use_parentheses=True line_length=88 ================================================ FILE: LICENSE.md ================================================ MIT License Copyright (c) 2022 Michael Hansen Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ ![Rhasspy 3](img/banner.png) **NOTE: This is a very early developer preview!** An open source toolkit for building voice assistants. 
![Voice assistant pipeline](img/pipeline.png) Rhasspy focuses on: * Privacy - no data leaves your computer unless you want it to * Broad language support - more than just English * Customization - everything can be changed ## Getting Started * Check out the [tutorial](docs/tutorial.md) * Connect Rhasspy to [Home Assistant](docs/home_assistant.md) * Install the [Rhasspy 3 add-on](https://github.com/rhasspy/hassio-addons) * Run one or more [satellites](docs/satellite.md) * Join [the community](https://community.rhasspy.org/) ## Missing Pieces This is a developer preview, so there are lots of things missing: * A user friendly web UI * An automated method for installing programs/services and downloading models * Support for custom speech to text grammars * Intent systems besides Home Assistant * The ability to accumulate context within a pipeline ## Core Concepts ### Domains Rhasspy is organized by [domain](docs/domains.md): * mic - audio input * wake - wake word detection * asr - speech to text * vad - voice activity detection * intent - intent recognition from text * handle - intent or text input handling * tts - text to speech * snd - audio output ### Programs Rhasspy talks to external programs using the [Wyoming protocol](docs/wyoming.md). You can add your own programs by implementing the protocol or using an [adapter](#adapters). ### Adapters [Small scripts](docs/adapters.md) that live in `bin/` and bridge existing programs into the [Wyoming protocol](docs/wyoming.md). For example, a speech to text program (`asr`) that accepts a WAV file and outputs text can use `asr_adapter_wav2text.py` ### Pipelines Complete voice loop from microphone input (mic) to speaker output (snd). Stages are: 1. detect (optional) * Wait until wake word is detected in mic 2. transcribe * Listen until vad detects silence, then convert audio to text 3. recognize (optional) * Recognize an intent from text 4. handle * Handle an intent or text, producing a text response 5. 
speak * Convert handle output text to speech, and speak through snd ### Servers Some programs take a while to load, so it's best to leave them running as a server. Use `bin/server_run.py` or add `--server ` when running the HTTP server. See `servers` section of `configuration.yaml` file. --- ## Supported Programs * mic * [arecord](https://alsa-project.org/wiki/Main_Page) * [gstreamer_udp](https://gstreamer.freedesktop.org/) * [sounddevice](https://python-sounddevice.readthedocs.io) * [pyaudio](https://people.csail.mit.edu/hubert/pyaudio/docs/) * wake * [porcupine1](https://github.com/Picovoice/porcupine) * [precise-lite](https://github.com/mycroftAI/mycroft-precise) * [snowboy](https://github.com/Kitt-AI/snowboy) * vad * [silero](https://github.com/snakers4/silero-vad) * [webrtcvad](https://pypi.org/project/webrtcvad/) * asr * [whisper](https://github.com/openai/whisper) * [whisper-cpp](https://github.com/ggerganov/whisper.cpp/) * [faster-whisper](https://github.com/guillaumekln/faster-whisper/) * [vosk](https://alphacephei.com/vosk/) * [coqui-stt](https://stt.readthedocs.io) * [pocketsphinx](https://github.com/cmusphinx/pocketsphinx) * handle * [home_assistant_conversation](https://www.home-assistant.io/docs/assist) * tts * [piper](https://github.com/rhasspy/piper/) * [mimic3](https://github.com/mycroftAI/mimic3) * [larynx](https://github.com/rhasspy/larynx/) * [coqui-tts](https://tts.readthedocs.io) * [marytts](http://mary.dfki.de/) * [flite](http://www.festvox.org/flite/) * [festival](http://www.cstr.ed.ac.uk/projects/festival/) * [espeak-ng](https://github.com/espeak-ng/espeak-ng/) * snd * [aplay](https://alsa-project.org/wiki/Main_Page) * [gstreamer_udp](https://gstreamer.freedesktop.org/) --- ## HTTP API `http://localhost:13331/` Unless overridden, the pipeline named "default" is used. 
* `/pipeline/run` * Runs a full pipeline from mic to snd * Produces JSON * Override `pipeline` or: * `wake_program` * `asr_program` * `intent_program` * `handle_program` * `tts_program` * `snd_program` * Skip stages with `start_after` * `wake` - skip detection, body is detection name (text) * `asr` - skip recording, body is transcript (text) or WAV audio * `intent` - skip recognition, body is intent/not-recognized event (JSON) * `handle` - skip handling, body is handle/not-handled event (JSON) * `tts` - skip synthesis, body is WAV audio * Stop early with `stop_after` * `wake` - only detection * `asr` - detection and transcription * `intent` - detection, transcription, recognition * `handle` - detection, transcription, recognition, handling * `tts` - detection, transcription, recognition, handling, synthesis * `/wake/detect` * Detect wake word in WAV input * Produces JSON * Override `wake_program` or `pipeline` * `/asr/transcribe` * Transcribe audio from WAV input * Produces JSON * Override `asr_program` or `pipeline` * `/intent/recognize` * Recognizes intent from text body (POST) or `text` (GET) * Produces JSON * Override `intent_program` or `pipeline` * `/handle/handle` * Handles intent/text from body (POST) or `input` (GET) * `Content-Type` must be `application/json` for intent input * Override `handle_program` or `pipeline` * `/tts/synthesize` * Synthesizes audio from text body (POST) or `text` (GET) * Produces WAV audio * Override `tts_program` or `pipeline` * `/tts/speak` * Plays audio from text body (POST) or `text` (GET) * Produces JSON * Override `tts_program`, `snd_program`, or `pipeline` * `/snd/play` * Plays WAV audio via snd * Override `snd_program` or `pipeline` * `/config` * Returns JSON config * `/version` * Returns version info ## WebSocket API `ws://localhost:13331/` Audio streams are raw PCM in binary messages. Use the `rate`, `width`, and `channels` parameters for sample rate (hertz), width (bytes), and channel count. 
By default, input audio is 16Khz 16-bit mono, and output audio is 22Khz 16-bit mono. The client can "end" the audio stream by sending an empty binary message. * `/pipeline/asr-tts` * Run pipeline from asr (stream in) to tts (stream out) * Produces JSON messages as events happen * Override `pipeline` or: * `asr_program` * `vad_program` * `handle_program` * `tts_program` * Use `in_rate`, `in_width`, `in_channels` for audio input format * Use `out_rate`, `out_width`, `out_channels` for audio output format * `/wake/detect` * Detect wake word from websocket audio stream * Produces a JSON message when audio stream ends * Override `wake_program` or `pipeline` * `/asr/transcribe` * Transcribe a websocket audio stream * Produces a JSON message when audio stream ends * Override `asr_program` or `pipeline` * `/snd/play` * Play a websocket audio stream * Produces a JSON message when audio stream ends * Override `snd_program` or `pipeline` ================================================ FILE: bin/asr_adapter_raw2text.py ================================================ #!/usr/bin/env python3 import argparse import logging import shlex import subprocess from pathlib import Path from rhasspy3.asr import Transcript from rhasspy3.audio import AudioChunk, AudioChunkConverter, AudioStop from rhasspy3.event import read_event, write_event _FILE = Path(__file__) _DIR = _FILE.parent _LOGGER = logging.getLogger(_FILE.stem) def main() -> None: parser = argparse.ArgumentParser() parser.add_argument( "command", help="Command to run", ) parser.add_argument("--shell", action="store_true") # parser.add_argument( "--rate", type=int, help="Sample rate (hz)", ) parser.add_argument( "--width", type=int, help="Sample width bytes", ) parser.add_argument( "--channels", type=int, help="Sample channel count", ) parser.add_argument( "--debug", action="store_true", help="Print DEBUG messages to console" ) args = parser.parse_args() logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) 
if args.shell: command = args.command else: command = shlex.split(args.command) proc = subprocess.Popen( command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=args.shell ) text = "" converter = AudioChunkConverter(args.rate, args.width, args.channels) with proc: assert proc.stdin is not None assert proc.stdout is not None while True: event = read_event() if event is None: break if AudioChunk.is_type(event.type): chunk = AudioChunk.from_event(event) chunk = converter.convert(chunk) proc.stdin.write(chunk.audio) proc.stdin.flush() elif AudioStop.is_type(event.type): break stdout, _stderr = proc.communicate() text = stdout.decode() write_event(Transcript(text=text.strip()).event()) if __name__ == "__main__": main() ================================================ FILE: bin/asr_adapter_wav2text.py ================================================ #!/usr/bin/env python3 import argparse import logging import shlex import subprocess import tempfile import wave from pathlib import Path from rhasspy3.asr import Transcript from rhasspy3.audio import AudioChunk, AudioStop from rhasspy3.event import read_event, write_event _FILE = Path(__file__) _DIR = _FILE.parent _LOGGER = logging.getLogger(_FILE.stem) def main() -> None: parser = argparse.ArgumentParser() parser.add_argument( "command", help="Command to run", ) parser.add_argument("--shell", action="store_true") parser.add_argument( "--debug", action="store_true", help="Print DEBUG messages to console" ) args = parser.parse_args() logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) with tempfile.NamedTemporaryFile(mode="wb+", suffix=".wav") as wav_io: args.command = args.command.format(wav_file=wav_io.name) if args.shell: command = args.command else: command = shlex.split(args.command) wav_params_set = False wav_file: wave.Wave_write = wave.open(wav_io, "wb") try: with wav_file: while True: event = read_event() if event is None: break if AudioChunk.is_type(event.type): chunk = 
AudioChunk.from_event(event) if not wav_params_set: wav_file.setframerate(chunk.rate) wav_file.setsampwidth(chunk.width) wav_file.setnchannels(chunk.channels) wav_params_set = True wav_file.writeframes(chunk.audio) elif AudioStop.is_type(event.type): break wav_io.seek(0) text = subprocess.check_output(command, shell=args.shell).decode() write_event(Transcript(text=text.strip()).event()) except wave.Error: pass if __name__ == "__main__": main() ================================================ FILE: bin/asr_transcribe.py ================================================ #!/usr/bin/env python3 """Transcribes mic audio into text.""" import argparse import asyncio import json import logging import sys from pathlib import Path from rhasspy3.asr import DOMAIN, Transcript from rhasspy3.core import Rhasspy from rhasspy3.event import async_read_event from rhasspy3.mic import DOMAIN as MIC_DOMAIN from rhasspy3.program import create_process from rhasspy3.vad import segment _FILE = Path(__file__) _DIR = _FILE.parent _LOGGER = logging.getLogger(_FILE.stem) async def main() -> None: parser = argparse.ArgumentParser() parser.add_argument( "-c", "--config", default=_DIR.parent / "config", help="Configuration directory", ) parser.add_argument( "-p", "--pipeline", default="default", help="Name of pipeline to use" ) parser.add_argument( "--mic-program", help="Name of mic program to use (overrides pipeline)" ) parser.add_argument( "--asr-program", help="Name of asr program to use (overrides pipeline)" ) parser.add_argument( "--vad-program", help="Name of vad program to use (overrides pipeline)" ) # parser.add_argument( "--debug", action="store_true", help="Print DEBUG messages to console" ) args = parser.parse_args() logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) rhasspy = Rhasspy.load(args.config) mic_program = args.mic_program asr_program = args.asr_program vad_program = args.vad_program pipeline = rhasspy.config.pipelines.get(args.pipeline) if not 
mic_program: assert pipeline is not None, f"No pipeline named {args.pipeline}" mic_program = pipeline.mic assert mic_program, "No mic program" if not asr_program: assert pipeline is not None, f"No pipeline named {args.pipeline}" asr_program = pipeline.asr assert asr_program, "No asr program" if not vad_program: assert pipeline is not None, f"No pipeline named {args.pipeline}" vad_program = pipeline.vad assert vad_program, "No vad program" # Transcribe voice command async with (await create_process(rhasspy, MIC_DOMAIN, mic_program)) as mic_proc, ( await create_process(rhasspy, DOMAIN, asr_program) ) as asr_proc: assert mic_proc.stdout is not None assert asr_proc.stdin is not None assert asr_proc.stdout is not None _LOGGER.info("Ready") await segment(rhasspy, vad_program, mic_proc.stdout, asr_proc.stdin) # Read transcript _LOGGER.debug("Waiting for transcript") transcript = Transcript(text="") while True: event = await async_read_event(asr_proc.stdout) if event is None: break if Transcript.is_type(event.type): transcript = Transcript.from_event(event) break json.dump(transcript.event().to_dict(), sys.stdout, ensure_ascii=False) print("", flush=True) if __name__ == "__main__": asyncio.run(main()) ================================================ FILE: bin/asr_transcribe_stream.py ================================================ #!/usr/bin/env python3 """Transcribes raw audio from stdin into text.""" import argparse import asyncio import json import logging import sys from pathlib import Path from rhasspy3.asr import DOMAIN, Transcript from rhasspy3.audio import ( DEFAULT_IN_CHANNELS, DEFAULT_IN_RATE, DEFAULT_IN_WIDTH, DEFAULT_SAMPLES_PER_CHUNK, AudioChunk, AudioChunkConverter, AudioStart, AudioStop, ) from rhasspy3.core import Rhasspy from rhasspy3.event import async_read_event, async_write_event from rhasspy3.program import create_process _FILE = Path(__file__) _DIR = _FILE.parent _LOGGER = logging.getLogger(_FILE.stem) async def main() -> None: parser = 
argparse.ArgumentParser() parser.add_argument( "-c", "--config", default=_DIR.parent / "config", help="Configuration directory", ) parser.add_argument( "-p", "--pipeline", default="default", help="Name of pipeline to use" ) parser.add_argument( "--asr-program", help="Name of asr program to use (overrides pipeline)" ) # parser.add_argument( "--mic-rate", type=int, default=DEFAULT_IN_RATE, help="Input sample rate (hertz)", ) parser.add_argument( "--mic-width", type=int, default=DEFAULT_IN_WIDTH, help="Input sample width (bytes)", ) parser.add_argument( "--mic-channels", type=int, default=DEFAULT_IN_CHANNELS, help="Input sample channel count", ) # parser.add_argument( "--asr-rate", type=int, default=DEFAULT_IN_RATE, help="asr sample rate (hertz)" ) parser.add_argument( "--asr-width", type=int, default=DEFAULT_IN_WIDTH, help="asr sample width (bytes)", ) parser.add_argument( "--asr-channels", type=int, default=DEFAULT_IN_CHANNELS, help="asr sample channel count", ) parser.add_argument( "--samples-per-chunk", type=int, default=DEFAULT_SAMPLES_PER_CHUNK, help="Samples to process per chunk", ) # parser.add_argument( "--debug", action="store_true", help="Print DEBUG messages to console" ) args = parser.parse_args() logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) rhasspy = Rhasspy.load(args.config) asr_program = args.asr_program pipeline = rhasspy.config.pipelines.get(args.pipeline) if not asr_program: assert pipeline is not None, f"No pipeline named {args.pipeline}" asr_program = pipeline.asr assert asr_program, "No asr program" _LOGGER.debug("asr program: %s", asr_program) # Transcribe raw audio from stdin converter = AudioChunkConverter(args.asr_rate, args.asr_width, args.asr_channels) bytes_per_chunk = args.samples_per_chunk * args.mic_width * args.mic_channels timestamp = 0 async with (await create_process(rhasspy, DOMAIN, asr_program)) as asr_proc: assert asr_proc.stdin is not None assert asr_proc.stdout is not None _LOGGER.debug("Started %s", 
asr_program) await async_write_event( AudioStart( args.asr_rate, args.asr_width, args.asr_channels, timestamp=timestamp ).event(), asr_proc.stdin, ) audio_bytes = sys.stdin.buffer.read(bytes_per_chunk) while audio_bytes: chunk = AudioChunk( args.mic_rate, args.mic_width, args.mic_channels, audio_bytes, timestamp=timestamp, ) timestamp += chunk.milliseconds chunk = converter.convert(chunk) # Write audio await async_write_event( chunk.event(), asr_proc.stdin, ) audio_bytes = sys.stdin.buffer.read(bytes_per_chunk) await async_write_event(AudioStop(timestamp=timestamp).event(), asr_proc.stdin) # Read transcript transcript = Transcript(text="") while True: event = await async_read_event(asr_proc.stdout) if event is None: break if Transcript.is_type(event.type): transcript = Transcript.from_event(event) break json.dump(transcript.event().to_dict(), sys.stdout, ensure_ascii=False) print("", flush=True) if __name__ == "__main__": asyncio.run(main()) ================================================ FILE: bin/asr_transcribe_wav.py ================================================ #!/usr/bin/env python3 """Transcribes WAV audio into text.""" import argparse import asyncio import io import json import logging import os import sys import time import wave from pathlib import Path from typing import Iterable, Optional from rhasspy3.asr import DOMAIN, Transcript from rhasspy3.audio import ( DEFAULT_IN_CHANNELS, DEFAULT_IN_RATE, DEFAULT_IN_WIDTH, DEFAULT_SAMPLES_PER_CHUNK, AudioChunkConverter, AudioStart, AudioStop, wav_to_chunks, ) from rhasspy3.core import Rhasspy from rhasspy3.event import async_read_event, async_write_event from rhasspy3.program import create_process _FILE = Path(__file__) _DIR = _FILE.parent _LOGGER = logging.getLogger(_FILE.stem) async def main() -> None: parser = argparse.ArgumentParser() parser.add_argument( "-c", "--config", default=_DIR.parent / "config", help="Configuration directory", ) parser.add_argument( "-p", "--pipeline", default="default", 
help="Name of pipeline to use" ) parser.add_argument( "--asr-program", help="Name of asr program to use (overrides pipeline)" ) # parser.add_argument( "--rate", type=int, default=DEFAULT_IN_RATE, help="Sample rate (hertz)" ) parser.add_argument( "--width", type=int, default=DEFAULT_IN_WIDTH, help="Sample width (bytes)" ) parser.add_argument( "--channels", type=int, default=DEFAULT_IN_CHANNELS, help="Sample channel count" ) parser.add_argument( "--samples-per-chunk", type=int, default=DEFAULT_SAMPLES_PER_CHUNK ) # parser.add_argument("wav", nargs="*", help="Path to WAV file(s)") # parser.add_argument( "--debug", action="store_true", help="Print DEBUG messages to console" ) args = parser.parse_args() logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) rhasspy = Rhasspy.load(args.config) asr_program = args.asr_program pipeline = rhasspy.config.pipelines.get(args.pipeline) if not asr_program: assert pipeline is not None, f"No pipeline named {args.pipeline}" asr_program = pipeline.asr assert asr_program, "No asr program" _LOGGER.debug("asr program: %s", asr_program) # Transcribe WAV file(s) for wav_bytes in get_wav_bytes(args): converter = AudioChunkConverter(args.rate, args.width, args.channels) with io.BytesIO(wav_bytes) as wav_io: with wave.open(wav_io, "rb") as wav_file: chunks = list(wav_to_chunks(wav_file, args.samples_per_chunk)) async with (await create_process(rhasspy, DOMAIN, asr_program)) as asr_proc: assert asr_proc.stdin is not None assert asr_proc.stdout is not None # Write audio start_time = time.monotonic_ns() await async_write_event( AudioStart(args.rate, args.width, args.channels, timestamp=0).event(), asr_proc.stdin, ) last_timestamp: Optional[int] = None for chunk in chunks: chunk = converter.convert(chunk) await async_write_event(chunk.event(), asr_proc.stdin) last_timestamp = chunk.timestamp await async_write_event( AudioStop(timestamp=last_timestamp).event(), asr_proc.stdin, ) # Read transcript _LOGGER.debug("Waiting for 
transcription") transcript = Transcript(text="") while True: event = await async_read_event(asr_proc.stdout) if event is None: break if Transcript.is_type(event.type): transcript = Transcript.from_event(event) end_time = time.monotonic_ns() _LOGGER.debug( "Transcribed in %s second(s)", (end_time - start_time) / 1e9 ) break _LOGGER.debug(transcript) json.dump(transcript.event().to_dict(), sys.stdout, ensure_ascii=False) print("", flush=True) def get_wav_bytes(args: argparse.Namespace) -> Iterable[bytes]: """Yields WAV audio from stdin or args.""" if args.wav: # WAV file path(s) for wav_path in args.wav: _LOGGER.debug("Processing %s", wav_path) with open(wav_path, "rb") as wav_file: yield wav_file.read() else: # WAV on stdin if os.isatty(sys.stdin.fileno()): print("Reading WAV audio from stdin", file=sys.stderr) yield sys.stdin.buffer.read() if __name__ == "__main__": asyncio.run(main()) ================================================ FILE: bin/client_unix_socket.py ================================================ #!/usr/bin/env python3 import argparse import logging import socket import threading from rhasspy3.event import read_event, write_event _LOGGER = logging.getLogger("wrapper_unix_socket") def main(): parser = argparse.ArgumentParser() parser.add_argument("socketfile", help="Path to Unix domain socket file") parser.add_argument("--debug", action="store_true", help="Log DEBUG messages") args = parser.parse_args() logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) _LOGGER.debug("Connecting to %s", args.socketfile) sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) sock.connect(args.socketfile) _LOGGER.debug("Connected") try: with sock.makefile(mode="rwb") as conn_file: read_thread = threading.Thread( target=read_proc, args=(conn_file,), daemon=True ) read_thread.start() write_thread = threading.Thread( target=write_proc, args=(conn_file,), daemon=True ) write_thread.start() write_thread.join() except KeyboardInterrupt: pass def 
read_proc(conn_file): try: while True: event = read_event(conn_file) if event is None: break write_event(event) except Exception: _LOGGER.exception("Unexpected error in read thread") def write_proc(conn_file): try: while True: event = read_event() if event is None: break write_event(event, conn_file) except Exception: _LOGGER.exception("Unexpected error in write thread") if __name__ == "__main__": main() ================================================ FILE: bin/config_print.py ================================================ #!/usr/bin/env python3 """Prints configuration as JSON.""" import argparse import json import logging import sys from pathlib import Path from rhasspy3.core import Rhasspy _FILE = Path(__file__) _DIR = _FILE.parent _LOGGER = logging.getLogger(_FILE.stem) def main() -> None: parser = argparse.ArgumentParser() parser.add_argument( "-c", "--config", default=_DIR.parent / "config", help="Configuration directory", ) parser.add_argument("--indent", type=int, default=4) parser.add_argument( "--debug", action="store_true", help="Print DEBUG messages to console" ) args = parser.parse_args() logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) rhasspy = Rhasspy.load(args.config) json.dump(rhasspy.config_dict, sys.stdout, indent=args.indent, ensure_ascii=False) if __name__ == "__main__": main() ================================================ FILE: bin/handle_adapter_json.py ================================================ #!/usr/bin/env python3 import argparse import json import logging import shlex import subprocess from pathlib import Path from rhasspy3.event import read_event, write_event from rhasspy3.handle import Handled, NotHandled from rhasspy3.intent import Intent, NotRecognized _FILE = Path(__file__) _DIR = _FILE.parent _LOGGER = logging.getLogger(_FILE.stem) def main() -> None: parser = argparse.ArgumentParser() parser.add_argument( "command", help="Command to run", ) parser.add_argument("--shell", action="store_true") 
#!/usr/bin/env python3
"""Adapter that forwards a recognized intent as JSON to an external command."""
import argparse
import json
import logging
import shlex
import subprocess
from pathlib import Path

from rhasspy3.event import read_event, write_event
from rhasspy3.handle import Handled, NotHandled
from rhasspy3.intent import Intent, NotRecognized

_FILE = Path(__file__)
_DIR = _FILE.parent
_LOGGER = logging.getLogger(_FILE.stem)


def main() -> None:
    """Run the handle command once for the first intent event received."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "command",
        help="Command to run",
    )
    parser.add_argument("--shell", action="store_true")
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to console"
    )
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    command = args.command if args.shell else shlex.split(args.command)
    proc = subprocess.Popen(
        command,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        shell=args.shell,
        universal_newlines=True,
    )
    with proc:
        assert proc.stdin is not None
        assert proc.stdout is not None

        while True:
            event = read_event()
            if event is None:
                break

            if Intent.is_type(event.type):
                intent = Intent.from_event(event)
                stdout, _stderr = proc.communicate(input=_intent_to_json(intent))
                _write_response(stdout)
                break

            if NotRecognized.is_type(event.type):
                # Nothing to handle
                write_event(NotHandled().event())
                break


def _intent_to_json(intent: Intent) -> str:
    """Serialize an intent in the JSON shape expected by the command."""
    entities = intent.entities or []
    return json.dumps(
        {
            "intent": {
                "name": intent.name,
            },
            "entities": [
                {"entity": entity.name, "value": entity.value} for entity in entities
            ],
            "slots": {entity.name: entity.value for entity in entities},
        },
        ensure_ascii=False,
    )


def _write_response(stdout: str) -> None:
    """Emit the first non-empty output line as Handled, else NotHandled."""
    for line in stdout.splitlines():
        line = line.strip()
        if line:
            write_event(Handled(text=line).event())
            return

    write_event(NotHandled().event())


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
"""Adapter that forwards a transcript's text to an external handle command."""
import argparse
import logging
import shlex
import subprocess
from pathlib import Path

from rhasspy3.asr import Transcript
from rhasspy3.event import read_event, write_event
from rhasspy3.handle import Handled, NotHandled

_FILE = Path(__file__)
_DIR = _FILE.parent
_LOGGER = logging.getLogger(_FILE.stem)


def main() -> None:
    """Run the handle command once for the first transcript event received."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "command",
        help="Command to run",
    )
    parser.add_argument("--shell", action="store_true")
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to console"
    )
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    command = args.command if args.shell else shlex.split(args.command)
    proc = subprocess.Popen(
        command,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        shell=args.shell,
        universal_newlines=True,
    )
    with proc:
        assert proc.stdin is not None
        assert proc.stdout is not None

        while True:
            event = read_event()
            if event is None:
                break

            if Transcript.is_type(event.type):
                transcript = Transcript.from_event(event)
                stdout, _stderr = proc.communicate(input=transcript.text)
                _write_response(stdout)
                break


def _write_response(stdout: str) -> None:
    """Emit the first non-empty output line as Handled, else NotHandled."""
    for line in stdout.splitlines():
        line = line.strip()
        if line:
            write_event(Handled(text=line).event())
            return

    write_event(NotHandled().event())


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
"""Handle text or intent."""
import argparse
import asyncio
import json
import logging
import os
import sys
from pathlib import Path
from typing import Iterable

from rhasspy3.core import Rhasspy
from rhasspy3.handle import handle
from rhasspy3.intent import Intent

_FILE = Path(__file__)
_DIR = _FILE.parent
_LOGGER = logging.getLogger(_FILE.stem)


async def main() -> None:
    """Run the configured handle program for each intent JSON event."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-c",
        "--config",
        default=_DIR.parent / "config",
        help="Configuration directory",
    )
    parser.add_argument(
        "-p", "--pipeline", default="default", help="Name of pipeline to use"
    )
    parser.add_argument(
        "--handle-program", help="Name of handle program to use (overrides pipeline)"
    )
    parser.add_argument("intent", nargs="*", help="Intent JSON event(s) to handle")
    #
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to console"
    )
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    rhasspy = Rhasspy.load(args.config)
    handle_program = args.handle_program
    pipeline = rhasspy.config.pipelines.get(args.pipeline)

    if not handle_program:
        assert pipeline is not None, f"No pipeline named {args.pipeline}"
        handle_program = pipeline.handle

    assert handle_program, "No handle program"

    for line in get_input(args):
        # Intent JSON
        handle_input: Intent = Intent.from_dict(json.loads(line))
        handle_result = await handle(rhasspy, handle_program, handle_input)
        if handle_result is None:
            _LOGGER.warning("No result")
            continue

        _LOGGER.debug(handle_result)
        json.dump(handle_result.event().to_dict(), sys.stdout, ensure_ascii=False)
        # Fix: terminate each JSON event with a flushed newline so multiple
        # results are line-delimited (consistent with intent_recognize.py).
        print("", flush=True)


def get_input(args: argparse.Namespace) -> Iterable[str]:
    """Get input from stdin or args."""
    if args.intent:
        for event_json in args.intent:
            yield event_json
    else:
        if os.isatty(sys.stdin.fileno()):
            print("Reading input from stdin", file=sys.stderr)

        for line in sys.stdin:
            line = line.strip()
            if line:
                yield line


if __name__ == "__main__":
    asyncio.run(main())
#!/usr/bin/env python3
"""Handle text or intent."""
import argparse
import asyncio
import json
import logging
import os
import sys
from pathlib import Path
from typing import Iterable

from rhasspy3.asr import Transcript
from rhasspy3.core import Rhasspy
from rhasspy3.handle import handle

_FILE = Path(__file__)
_DIR = _FILE.parent
_LOGGER = logging.getLogger(_FILE.stem)


async def main() -> None:
    """Run the configured handle program for each text input."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-c",
        "--config",
        default=_DIR.parent / "config",
        help="Configuration directory",
    )
    parser.add_argument(
        "-p", "--pipeline", default="default", help="Name of pipeline to use"
    )
    parser.add_argument(
        "--handle-program", help="Name of handle program to use (overrides pipeline)"
    )
    parser.add_argument("text", nargs="*", help="Text input to handle")
    #
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to console"
    )
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    rhasspy = Rhasspy.load(args.config)
    handle_program = args.handle_program
    pipeline = rhasspy.config.pipelines.get(args.pipeline)

    if not handle_program:
        assert pipeline is not None, f"No pipeline named {args.pipeline}"
        handle_program = pipeline.handle

    assert handle_program, "No handle program"

    for line in get_input(args):
        # Text
        handle_input = Transcript(text=line)
        handle_result = await handle(rhasspy, handle_program, handle_input)
        if handle_result is None:
            _LOGGER.warning("No result")
            continue

        _LOGGER.debug(handle_result)
        json.dump(handle_result.event().to_dict(), sys.stdout, ensure_ascii=False)
        # Fix: terminate each JSON event with a flushed newline so multiple
        # results are line-delimited (consistent with intent_recognize.py).
        print("", flush=True)


def get_input(args: argparse.Namespace) -> Iterable[str]:
    """Get input from stdin or args."""
    if args.text:
        for text in args.text:
            yield text
    else:
        if os.isatty(sys.stdin.fileno()):
            print("Reading input from stdin", file=sys.stderr)

        for line in sys.stdin:
            line = line.strip()
            if line:
                yield line


if __name__ == "__main__":
    asyncio.run(main())
#!/usr/bin/env python3
"""Recognize intents from text using the configured intent program."""
import argparse
import asyncio
import json
import logging
import os
import sys
from pathlib import Path
from typing import Iterable

from rhasspy3.core import Rhasspy
from rhasspy3.intent import recognize

_FILE = Path(__file__)
_DIR = _FILE.parent
_LOGGER = logging.getLogger(_FILE.stem)


async def main() -> None:
    """Recognize each input text and print the resulting intent event JSON."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-c",
        "--config",
        default=_DIR.parent / "config",
        help="Configuration directory",
    )
    parser.add_argument(
        "-p", "--pipeline", default="default", help="Name of pipeline to use"
    )
    parser.add_argument(
        "--intent-program", help="Name of intent program to use (overrides pipeline)"
    )
    parser.add_argument("text", nargs="*", help="Text to recognize")
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to console"
    )
    args = parser.parse_args()

    log_level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(level=log_level)

    rhasspy = Rhasspy.load(args.config)
    intent_program = args.intent_program
    pipeline = rhasspy.config.pipelines.get(args.pipeline)

    if not intent_program:
        assert pipeline is not None, f"No pipeline named {args.pipeline}"
        intent_program = pipeline.intent

    assert intent_program, "No intent program"

    for text in get_texts(args):
        intent_result = await recognize(rhasspy, intent_program, text)
        if intent_result is None:
            continue

        # One JSON event per line
        json.dump(intent_result.event().data, sys.stdout, ensure_ascii=False)
        print("", flush=True)


def get_texts(args: argparse.Namespace) -> Iterable[str]:
    """Yield texts from command-line arguments or, failing that, stdin lines."""
    if args.text:
        yield from args.text
        return

    if os.isatty(sys.stdin.fileno()):
        print("Reading text from stdin", file=sys.stderr)

    for line in sys.stdin:
        line = line.strip()
        if line:
            yield line


if __name__ == "__main__":
    asyncio.run(main())
#!/usr/bin/env python3
"""Reads raw audio chunks from stdin."""
import argparse
import logging
import shlex
import subprocess
import time
from pathlib import Path

from rhasspy3.audio import DEFAULT_SAMPLES_PER_CHUNK, AudioChunk, AudioStart
from rhasspy3.event import write_event

_FILE = Path(__file__)
_DIR = _FILE.parent
_LOGGER = logging.getLogger(_FILE.stem)


def main() -> None:
    """Run a mic command and re-emit its raw PCM output as audio chunk events."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "command",
        help="Command to run",
    )
    parser.add_argument("--shell", action="store_true", help="Run command with shell")
    #
    parser.add_argument(
        "--samples-per-chunk",
        type=int,
        default=DEFAULT_SAMPLES_PER_CHUNK,
        help="Number of samples to read at a time from command",
    )
    parser.add_argument(
        "--rate",
        type=int,
        required=True,
        help="Sample rate (hz)",
    )
    parser.add_argument(
        "--width",
        type=int,
        required=True,
        help="Sample width bytes",
    )
    parser.add_argument(
        "--channels",
        type=int,
        required=True,
        help="Sample channel count",
    )
    #
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to console"
    )
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    bytes_per_chunk = args.samples_per_chunk * args.width * args.channels

    if args.shell:
        command = args.command
    else:
        command = shlex.split(args.command)

    # Fix: pass shell=args.shell to Popen. Previously --shell was parsed but
    # never forwarded, so a shell command string was exec'd as a program name
    # and failed. Matches the handle adapters, which pass shell=args.shell.
    proc = subprocess.Popen(command, stdout=subprocess.PIPE, shell=args.shell)
    with proc:
        assert proc.stdout is not None
        write_event(
            AudioStart(
                args.rate, args.width, args.channels, timestamp=time.monotonic_ns()
            ).event()
        )
        while True:
            audio_bytes = proc.stdout.read(bytes_per_chunk)
            if not audio_bytes:
                # Command exited or closed stdout
                break

            write_event(
                AudioChunk(
                    args.rate,
                    args.width,
                    args.channels,
                    audio_bytes,
                    timestamp=time.monotonic_ns(),
                ).event()
            )


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
"""Record a spoken audio sample to a WAV file."""
import argparse
import asyncio
import logging
import wave
from collections import deque
from pathlib import Path
from typing import Deque

from rhasspy3.audio import AudioChunk
from rhasspy3.core import Rhasspy
from rhasspy3.event import async_read_event, async_write_event
from rhasspy3.mic import DOMAIN as MIC_DOMAIN
from rhasspy3.program import create_process
from rhasspy3.vad import DOMAIN as VAD_DOMAIN
from rhasspy3.vad import VoiceStarted, VoiceStopped

_FILE = Path(__file__)
_DIR = _FILE.parent
_LOGGER = logging.getLogger(_FILE.stem)


async def main() -> None:
    """Stream mic audio through a VAD program, writing each detected voice
    command to the corresponding WAV file given on the command line."""
    parser = argparse.ArgumentParser()
    parser.add_argument("wav_file", nargs="+", help="Path to WAV file(s) to write")
    #
    parser.add_argument(
        "-c",
        "--config",
        default=_DIR.parent / "config",
        help="Configuration directory",
    )
    parser.add_argument(
        "-p", "--pipeline", default="default", help="Name of pipeline to use"
    )
    parser.add_argument(
        "--mic-program", help="Name of mic program to use (overrides pipeline)"
    )
    parser.add_argument(
        "--vad-program", help="Name of vad program to use (overrides pipeline)"
    )
    #
    parser.add_argument(
        "--chunk-buffer-size",
        type=int,
        default=25,
        help="Audio chunks to buffer before start is known",
    )
    parser.add_argument(
        "-b",
        "--keep-chunks-before",
        type=int,
        default=5,
        help="Audio chunks to keep before voice starts",
    )
    parser.add_argument(
        "-a",
        "--keep-chunks-after",
        type=int,
        default=0,
        help="Audio chunks to keep after voice ends",
    )
    #
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to console"
    )
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    rhasspy = Rhasspy.load(args.config)
    mic_program = args.mic_program
    vad_program = args.vad_program
    pipeline = rhasspy.config.pipelines.get(args.pipeline)

    # Fall back to the pipeline's configured programs when not overridden.
    if not mic_program:
        assert pipeline is not None, f"No pipeline named {args.pipeline}"
        mic_program = pipeline.mic

    assert mic_program, "No mic program"

    if not vad_program:
        assert pipeline is not None, f"No pipeline named {args.pipeline}"
        vad_program = pipeline.vad

    assert vad_program, "No vad program"

    for wav_path in args.wav_file:
        wav_file: wave.Wave_write = wave.open(wav_path, "wb")
        with wav_file:
            is_first_chunk = True

            # Audio kept before we get the event that the voice command started
            # at a timestep in the past.
            chunk_buffer: Deque[AudioChunk] = deque(
                maxlen=max(args.chunk_buffer_size, args.keep_chunks_before)
            )
            async with (
                await create_process(rhasspy, MIC_DOMAIN, mic_program)
            ) as mic_proc, (
                await create_process(rhasspy, VAD_DOMAIN, vad_program)
            ) as vad_proc:
                assert mic_proc.stdout is not None
                assert vad_proc.stdin is not None
                assert vad_proc.stdout is not None

                _LOGGER.info("Recording %s", wav_path)

                # Two concurrent readers: mic events and VAD events.
                mic_task = asyncio.create_task(async_read_event(mic_proc.stdout))
                vad_task = asyncio.create_task(async_read_event(vad_proc.stdout))
                pending = {mic_task, vad_task}
                before_command = True

                while True:
                    done, pending = await asyncio.wait(
                        pending, return_when=asyncio.FIRST_COMPLETED
                    )
                    if mic_task in done:
                        mic_event = mic_task.result()
                        if mic_event is None:
                            break

                        # Process chunk
                        if AudioChunk.is_type(mic_event.type):
                            chunk = AudioChunk.from_event(mic_event)
                            if is_first_chunk:
                                # WAV parameters come from the first chunk.
                                _LOGGER.debug("Receiving audio")
                                is_first_chunk = False
                                wav_file.setframerate(chunk.rate)
                                wav_file.setsampwidth(chunk.width)
                                wav_file.setnchannels(chunk.channels)

                            # Every chunk is forwarded to the VAD; it is only
                            # written to the WAV once the command has started.
                            await async_write_event(mic_event, vad_proc.stdin)
                            if before_command:
                                chunk_buffer.append(chunk)
                            else:
                                wav_file.writeframes(chunk.audio)

                        # Next chunk
                        mic_task = asyncio.create_task(
                            async_read_event(mic_proc.stdout)
                        )
                        pending.add(mic_task)

                    if vad_task in done:
                        vad_event = vad_task.result()
                        if vad_event is None:
                            break

                        if VoiceStarted.is_type(vad_event.type):
                            if before_command:
                                # Start of voice command
                                voice_started = VoiceStarted.from_event(vad_event)
                                if voice_started.timestamp is None:
                                    # Keep chunks before
                                    # NOTE(review): chunks_left is never
                                    # decremented, so this drains the whole
                                    # buffer rather than keep_chunks_before
                                    # chunks — confirm intent.
                                    chunks_left = args.keep_chunks_before
                                    while chunk_buffer and (chunks_left > 0):
                                        chunk = chunk_buffer.popleft()
                                        wav_file.writeframes(chunk.audio)
                                else:
                                    # Locate start chunk
                                    start_idx = 0
                                    for i, chunk in enumerate(chunk_buffer):
                                        if (chunk.timestamp is not None) and (
                                            chunk.timestamp >= voice_started.timestamp
                                        ):
                                            start_idx = i
                                            break

                                    # Back up by "keep chunks" and then write audio forward
                                    start_idx = max(
                                        0, start_idx - args.keep_chunks_before
                                    )
                                    for i, chunk in enumerate(chunk_buffer):
                                        if i >= start_idx:
                                            wav_file.writeframes(chunk.audio)

                                chunk_buffer.clear()
                                before_command = False
                                _LOGGER.info("Speaking started")
                        elif VoiceStopped.is_type(vad_event.type):
                            # End of voice command
                            _LOGGER.info("Speaking ended")
                            break

                        # Next VAD event
                        vad_task = asyncio.create_task(
                            async_read_event(vad_proc.stdout)
                        )
                        pending.add(vad_task)

                # After chunks: optionally append trailing audio after the
                # voice command has ended.
                num_chunks_left = args.keep_chunks_after
                while num_chunks_left > 0:
                    mic_event = await mic_task
                    if mic_event is None:
                        break

                    if AudioChunk.is_type(mic_event.type):
                        chunk = AudioChunk.from_event(mic_event)
                        wav_file.writeframes(chunk.audio)
                        num_chunks_left -= 1
                        if num_chunks_left > 0:
                            mic_task = asyncio.create_task(
                                async_read_event(mic_proc.stdout)
                            )


if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        pass
#!/usr/bin/env python3
"""Prints microphone energy level to console for testing."""
import argparse
import asyncio
import audioop
import logging
from pathlib import Path

from rhasspy3.audio import AudioChunk, AudioStop
from rhasspy3.core import Rhasspy
from rhasspy3.event import async_read_event
from rhasspy3.mic import DOMAIN as MIC_DOMAIN
from rhasspy3.program import create_process

_FILE = Path(__file__)
_DIR = _FILE.parent
_LOGGER = logging.getLogger(_FILE.stem)


async def main() -> None:
    """Read chunks from the mic program and display a live energy meter."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-c",
        "--config",
        default=_DIR.parent / "config",
        help="Configuration directory",
    )
    parser.add_argument(
        "-p", "--pipeline", default="default", help="Name of pipeline to use"
    )
    parser.add_argument(
        "--mic-program", help="Name of mic program to use (overrides pipeline)"
    )
    #
    parser.add_argument(
        "--levels", type=int, default=40, help="Number of levels to display"
    )
    parser.add_argument(
        "--numeric",
        action="store_true",
        help="Print energy numeric values instead of showing level",
    )
    #
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to console"
    )
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    rhasspy = Rhasspy.load(args.config)
    mic_program = args.mic_program
    pipeline = rhasspy.config.pipelines.get(args.pipeline)

    if not mic_program:
        assert pipeline is not None, f"No pipeline named {args.pipeline}"
        mic_program = pipeline.mic

    assert mic_program, "No mic program"

    max_energy = 0
    async with (await create_process(rhasspy, MIC_DOMAIN, mic_program)) as mic_proc:
        assert mic_proc.stdout is not None
        while True:
            event = await async_read_event(mic_proc.stdout)
            if event is None:
                break

            if AudioChunk.is_type(event.type):
                chunk = AudioChunk.from_event(event)

                # Subtract the chunk's RMS from every sample before taking the
                # RMS again — a rough constant-offset (DC bias) compensation.
                # NOTE(review): energy_bytes packs the negated RMS as exactly
                # two little-endian bytes, which assumes 16-bit samples
                # (chunk.width == 2) — confirm against the mic programs.
                energy = -audioop.rms(chunk.audio, chunk.width)
                energy_bytes = bytes([energy & 0xFF, (energy >> 8) & 0xFF])
                debiased_energy = audioop.rms(
                    audioop.add(
                        chunk.audio,
                        energy_bytes * (len(chunk.audio) // chunk.width),
                        chunk.width,
                    ),
                    chunk.width,
                )

                # Track the running maximum (at least 1 to avoid div-by-zero).
                max_energy = max(max_energy, debiased_energy)
                max_energy = max(1, max_energy)

                if args.numeric:
                    # Print numbers
                    print(debiased_energy, "/", max_energy)
                else:
                    # Print graphic: bar scaled relative to the max seen so far.
                    energy_level = int(args.levels * (debiased_energy / max_energy))
                    energy_level = max(0, energy_level)
                    print(
                        "\r",  # We still use typewriters!
                        "[",
                        "*" * energy_level,
                        " " * (args.levels - energy_level),
                        "]",
                        sep="",
                        end="",
                    )
            elif AudioStop.is_type(event.type):
                break


if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        pass
#!/usr/bin/env python3
"""Run a pipeline all or part of the way."""
import argparse
import asyncio
import json
import logging
import sys
from pathlib import Path
from typing import IO, Optional, Union

from rhasspy3.asr import Transcript
from rhasspy3.audio import DEFAULT_SAMPLES_PER_CHUNK
from rhasspy3.core import Rhasspy
from rhasspy3.event import Event
from rhasspy3.handle import Handled, NotHandled
from rhasspy3.intent import Intent, NotRecognized
from rhasspy3.pipeline import StopAfterDomain
from rhasspy3.pipeline import run as run_pipeline
from rhasspy3.wake import Detection

_FILE = Path(__file__)
_DIR = _FILE.parent
_LOGGER = logging.getLogger(_FILE.stem)


async def main() -> None:
    """Run the named pipeline, optionally short-circuiting stages with
    pre-supplied inputs, and print each pipeline result as JSON."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-c",
        "--config",
        default=_DIR.parent / "config",
        help="Configuration directory",
    )
    parser.add_argument(
        "-p", "--pipeline", default="default", help="Name of pipeline to use"
    )
    #
    parser.add_argument(
        "--stop-after",
        choices=[domain.value for domain in StopAfterDomain],
        help="Domain to stop pipeline after",
    )
    #
    parser.add_argument(
        "--wake-name", help="Skip wake word detection and use name instead"
    )
    parser.add_argument(
        "--asr-wav",
        help="Use WAV file for speech to text instead of mic input (skips wake)",
    )
    parser.add_argument("--asr-text", help="Use text for asr transcript (skips wake)")
    parser.add_argument(
        "--intent-json", help="Use JSON for recognized intent (skips wake, asr)"
    )
    parser.add_argument(
        "--handle-text", help="Use text for handle response (skips handle)"
    )
    parser.add_argument(
        "--tts-wav", help="Play WAV file instead of text to speech response (skips tts)"
    )
    parser.add_argument(
        "--samples-per-chunk", type=int, default=DEFAULT_SAMPLES_PER_CHUNK
    )
    parser.add_argument("--asr-chunks-to-buffer", type=int, default=0)
    parser.add_argument("--loop", action="store_true", help="Keep pipeline running")
    parser.add_argument("--debug", action="store_true", help="Log DEBUG messages")
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    # Each optional CLI value pre-fills a pipeline stage so it is skipped.
    wake_detection: Optional[Detection] = (
        Detection(name=args.wake_name) if args.wake_name else None
    )
    asr_wav_in: Optional[IO[bytes]] = open(args.asr_wav, "rb") if args.asr_wav else None
    asr_transcript: Optional[Transcript] = (
        Transcript(text=args.asr_text) if args.asr_text else None
    )
    intent_result: Optional[Union[Intent, NotRecognized]] = (
        _parse_intent_json(args.intent_json) if args.intent_json else None
    )
    handle_result: Optional[Union[Handled, NotHandled]] = (
        Handled(text=args.handle_text) if args.handle_text else None
    )
    tts_wav_in: Optional[IO[bytes]] = open(args.tts_wav, "rb") if args.tts_wav else None

    rhasspy = Rhasspy.load(args.config)
    while True:
        pipeline_result = await run_pipeline(
            rhasspy,
            args.pipeline,
            samples_per_chunk=args.samples_per_chunk,
            asr_chunks_to_buffer=args.asr_chunks_to_buffer,
            wake_detection=wake_detection,
            asr_wav_in=asr_wav_in,
            asr_transcript=asr_transcript,
            intent_result=intent_result,
            handle_result=handle_result,
            tts_wav_in=tts_wav_in,
            stop_after=args.stop_after,
        )
        json.dump(pipeline_result.to_dict(), sys.stdout, ensure_ascii=False)
        print("")

        if not args.loop:
            break


def _parse_intent_json(intent_json: str) -> Optional[Union[Intent, NotRecognized]]:
    """Parse an intent/not-recognized event from its JSON representation."""
    intent_event = Event.from_dict(json.loads(intent_json))
    if Intent.is_type(intent_event.type):
        return Intent.from_event(intent_event)

    if NotRecognized.is_type(intent_event.type):
        return NotRecognized.from_event(intent_event)

    return None


if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        pass
#!/usr/bin/env python3
"""Download a named model/artifact for a configured program."""
import argparse
import logging
import shlex
import string
import subprocess
from pathlib import Path

from rhasspy3.core import Rhasspy

_FILE = Path(__file__)
_DIR = _FILE.parent
_LOGGER = logging.getLogger(_FILE.stem)


def main() -> None:
    """Resolve the program's download config and run its download command."""
    parser = argparse.ArgumentParser()
    parser.add_argument("domain")
    parser.add_argument("program")
    parser.add_argument("model")
    parser.add_argument(
        "-c",
        "--config",
        default=_DIR.parent / "config",
        help="Configuration directory",
    )
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to console"
    )
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    rhasspy = Rhasspy.load(args.config)

    program_config = rhasspy.config.programs.get(args.domain, {}).get(args.program)
    assert program_config is not None, f"No config for {args.domain} {args.program}"

    install = program_config.install
    assert install is not None, f"No install config for {args.domain} {args.program}"

    downloads = install.downloads
    assert downloads is not None, f"No downloads for {args.domain} {args.program}"

    model = downloads.get(args.model)
    assert (
        model is not None
    ), f"No download named {args.model} for {args.domain} {args.program}"

    program_dir = rhasspy.programs_dir / args.domain / args.program
    data_dir = rhasspy.data_dir / args.domain / args.program

    # ${variables} usable in check_file and the download command
    default_mapping = {
        "program_dir": str(program_dir.absolute()),
        "data_dir": str(data_dir.absolute()),
        "model": str(args.model),
    }

    # Check if already installed
    if model.check_file is not None:
        check_file = Path(
            string.Template(model.check_file).safe_substitute(default_mapping)
        )
        if check_file.exists():
            _LOGGER.info("Installed: %s", check_file)
            return

    download = install.download
    assert download is not None, f"No download config for {args.domain} {args.program}"

    download_command = string.Template(download.command).safe_substitute(
        default_mapping
    )
    _LOGGER.info(download_command)

    # Run from the program directory when it exists, else the config directory.
    cwd = program_dir if program_dir.exists() else rhasspy.config_dir
    if download.shell:
        subprocess.check_call(download_command, shell=True, cwd=cwd)
    else:
        subprocess.check_call(shlex.split(download_command), cwd=cwd)


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
"""Install a configured program by running its install command."""
import argparse
import logging
import shlex
import string
import subprocess
from pathlib import Path

from rhasspy3.core import Rhasspy

_FILE = Path(__file__)
_DIR = _FILE.parent
_LOGGER = logging.getLogger(_FILE.stem)


def main() -> None:
    """Resolve the program's install config and run its install command."""
    parser = argparse.ArgumentParser()
    parser.add_argument("domain")
    parser.add_argument("program")
    parser.add_argument(
        "-c",
        "--config",
        default=_DIR.parent / "config",
        help="Configuration directory",
    )
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to console"
    )
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    rhasspy = Rhasspy.load(args.config)

    program_config = rhasspy.config.programs.get(args.domain, {}).get(args.program)
    assert program_config is not None, f"No config for {args.domain} {args.program}"

    install = program_config.install
    assert install is not None, f"No install config for {args.domain} {args.program}"

    program_dir = rhasspy.programs_dir / args.domain / args.program
    data_dir = rhasspy.data_dir / args.domain / args.program

    # ${variables} usable in check_file and the install command
    default_mapping = {
        "program_dir": str(program_dir.absolute()),
        "data_dir": str(data_dir.absolute()),
    }

    # Check if already installed
    if install.check_file is not None:
        check_file = Path(
            string.Template(install.check_file).safe_substitute(default_mapping)
        )
        if check_file.exists():
            _LOGGER.info("Installed: %s", check_file)
            return

    install_command = string.Template(install.command).safe_substitute(default_mapping)
    _LOGGER.debug(install_command)

    # Run from the program directory when it exists, else the config directory.
    cwd = program_dir if program_dir.exists() else rhasspy.config_dir
    if install.shell:
        subprocess.check_call(install_command, shell=True, cwd=cwd)
    else:
        subprocess.check_call(shlex.split(install_command), cwd=cwd)


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
"""Run satellite loop."""
import argparse
import asyncio
import logging
from collections import deque
from pathlib import Path
from typing import Deque, List

from rhasspy3.audio import AudioChunk, AudioStop
from rhasspy3.core import Rhasspy
from rhasspy3.event import Event, async_read_event, async_write_event
from rhasspy3.mic import DOMAIN as MIC_DOMAIN
from rhasspy3.program import create_process
from rhasspy3.remote import DOMAIN as REMOTE_DOMAIN
from rhasspy3.snd import DOMAIN as SND_DOMAIN
from rhasspy3.snd import Played
from rhasspy3.wake import detect

_FILE = Path(__file__)
_DIR = _FILE.parent
_LOGGER = logging.getLogger(_FILE.stem)


async def main() -> None:
    """Wait for the wake word, stream mic audio to the remote base station,
    then play back the audio the base station returns."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-c",
        "--config",
        default=_DIR.parent / "config",
        help="Configuration directory",
    )
    parser.add_argument(
        "-s", "--satellite", default="default", help="Name of satellite to use"
    )
    #
    parser.add_argument(
        "--mic-program",
        help="Program to use for mic input (overrides satellite)",
    )
    parser.add_argument(
        "--wake-program",
        help="Program to use for wake word detection (overiddes satellite)",
    )
    parser.add_argument(
        "--remote-program",
        help="Program to use for remote communication with base station (overrides satellite)",
    )
    parser.add_argument(
        "--snd-program",
        help="Program to use for audio output (overrides satellite)",
    )
    #
    parser.add_argument("--asr-chunks-to-buffer", type=int, default=0)
    #
    parser.add_argument("--loop", action="store_true", help="Keep satellite running")
    parser.add_argument("--debug", action="store_true", help="Log DEBUG messages")
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    rhasspy = Rhasspy.load(args.config)
    mic_program = args.mic_program
    wake_program = args.wake_program
    remote_program = args.remote_program
    snd_program = args.snd_program
    satellite = rhasspy.config.satellites.get(args.satellite)

    # Fall back to the satellite's configured programs when not overridden.
    if not mic_program:
        assert satellite is not None, f"No satellite named {args.satellite}"
        mic_program = satellite.mic

    assert mic_program, "No mic program"

    if not wake_program:
        assert satellite is not None, f"No satellite named {args.satellite}"
        wake_program = satellite.wake

    assert wake_program, "No wake program"

    if not remote_program:
        assert satellite is not None, f"No satellite named {args.satellite}"
        remote_program = satellite.remote

    assert remote_program, "No remote program"

    if not snd_program:
        assert satellite is not None, f"No satellite named {args.satellite}"
        snd_program = satellite.snd

    assert snd_program, "No snd program"

    while True:
        # Chunks seen during wake detection, replayed to the remote so asr
        # does not miss the start of the command.
        chunk_buffer: Deque[Event] = deque(maxlen=args.asr_chunks_to_buffer)
        snd_buffer: List[Event] = []

        async with (await create_process(rhasspy, MIC_DOMAIN, mic_program)) as mic_proc:
            assert mic_proc.stdout is not None
            detection = await detect(
                rhasspy, wake_program, mic_proc.stdout, chunk_buffer
            )
            if detection is None:
                continue

            async with (
                await create_process(rhasspy, REMOTE_DOMAIN, remote_program)
            ) as remote_proc:
                assert remote_proc.stdin is not None
                assert remote_proc.stdout is not None

                # Flush buffered wake-phase chunks to the remote first.
                # NOTE(review): pop() takes from the right of the deque; the
                # resulting chunk order depends on how detect() fills the
                # buffer (not visible here) — confirm chunks arrive oldest-first.
                while chunk_buffer:
                    await async_write_event(chunk_buffer.pop(), remote_proc.stdin)

                mic_task = asyncio.create_task(async_read_event(mic_proc.stdout))
                remote_task = asyncio.create_task(async_read_event(remote_proc.stdout))
                pending = {mic_task, remote_task}

                try:
                    # Stream to remote until audio is received
                    while True:
                        done, pending = await asyncio.wait(
                            pending, return_when=asyncio.FIRST_COMPLETED
                        )
                        if mic_task in done:
                            mic_event = mic_task.result()
                            if mic_event is None:
                                break

                            if AudioChunk.is_type(mic_event.type):
                                await async_write_event(mic_event, remote_proc.stdin)

                            mic_task = asyncio.create_task(
                                async_read_event(mic_proc.stdout)
                            )
                            pending.add(mic_task)

                        if remote_task in done:
                            remote_event = remote_task.result()
                            if remote_event is not None:
                                # First response event; stop streaming mic audio.
                                snd_buffer.append(remote_event)

                            for task in pending:
                                task.cancel()

                            break

                    # Output audio
                    async with (
                        await create_process(rhasspy, SND_DOMAIN, snd_program)
                    ) as snd_proc:
                        assert snd_proc.stdin is not None
                        assert snd_proc.stdout is not None

                        for remote_event in snd_buffer:
                            if AudioChunk.is_type(remote_event.type):
                                await async_write_event(remote_event, snd_proc.stdin)
                            elif AudioStop.is_type(remote_event.type):
                                # Unexpected, but it could happen
                                continue

                        while True:
                            remote_event = await async_read_event(remote_proc.stdout)
                            if remote_event is None:
                                break

                            if AudioChunk.is_type(remote_event.type):
                                await async_write_event(remote_event, snd_proc.stdin)
                            elif AudioStop.is_type(remote_event.type):
                                await async_write_event(remote_event, snd_proc.stdin)
                                break

                        # Wait for audio to finish playing
                        while True:
                            snd_event = await async_read_event(snd_proc.stdout)
                            if snd_event is None:
                                break

                            if Played.is_type(snd_event.type):
                                break
                except Exception:
                    _LOGGER.exception(
                        "Unexpected error communicating with remote base station"
                    )

        if not args.loop:
            break


if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        pass
"config", help="Configuration directory", ) parser.add_argument("domain", help="Domain of server (asr, tts, etc.)") parser.add_argument("server", help="Name of server to run") parser.add_argument("--debug", action="store_true", help="Log DEBUG messages") args = parser.parse_args() logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) rhasspy = Rhasspy.load(args.config) server = rhasspy.config.servers[args.domain][args.server] program_dir = rhasspy.programs_dir / args.domain / args.server data_dir = rhasspy.data_dir / args.domain / args.server # ${variables} available within command and template args default_mapping = { "program_dir": str(program_dir.absolute()), "data_dir": str(data_dir.absolute()), } command_str = server.command command_mapping = dict(default_mapping) if server.template_args: # Substitute within template args args_mapping = dict(server.template_args) for arg_name, arg_str in args_mapping.items(): if not isinstance(arg_str, str): continue arg_template = string.Template(arg_str) args_mapping[arg_name] = arg_template.safe_substitute(default_mapping) merge_dict(command_mapping, args_mapping) command_template = string.Template(command_str) command_str = command_template.safe_substitute(command_mapping) env = dict(os.environ) # Add rhasspy3/bin to $PATH env["PATH"] = f'{rhasspy.base_dir}/bin:${env["PATH"]}' # Ensure stdout is flushed for Python programs env["PYTHONUNBUFFERED"] = "1" server_dir = rhasspy.programs_dir / args.domain / args.server cwd = server_dir if server_dir.is_dir() else rhasspy.base_dir if server.shell: command: Union[str, List[str]] = command_str else: command = shlex.split(command_str) _LOGGER.debug(command) proc = subprocess.Popen(command, shell=server.shell, cwd=cwd, env=env) with proc: sys.exit(proc.wait()) if __name__ == "__main__": try: main() except KeyboardInterrupt: pass ================================================ FILE: bin/snd_adapter_raw.py ================================================ 
def main() -> None:
    """Play audio by piping raw PCM from audio-chunk events into a command.

    Reads Wyoming events from stdin, converts each audio chunk to the target
    rate/width/channels, writes the PCM to the playback command's stdin, and
    always emits a ``played`` event when finished.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "command",
        help="Command to run",
    )
    parser.add_argument(
        "--rate", type=int, default=DEFAULT_OUT_RATE, help="Sample rate (hertz)"
    )
    parser.add_argument(
        "--width", type=int, default=DEFAULT_OUT_WIDTH, help="Sample width (bytes)"
    )
    parser.add_argument(
        "--channels",
        type=int,
        default=DEFAULT_OUT_CHANNELS,
        help="Sample channel count",
    )
    parser.add_argument("--shell", action="store_true", help="Run command with shell")
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to console"
    )
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    if args.shell:
        command = args.command
    else:
        command = shlex.split(args.command)

    try:
        # BUG FIX: shell=args.shell was missing, so with --shell the whole
        # command string was treated as a single executable name and failed.
        proc = subprocess.Popen(
            command,
            stdin=subprocess.PIPE,
            stdout=subprocess.DEVNULL,
            shell=args.shell,
        )
        assert proc.stdin is not None

        # Normalize incoming chunks to the playback command's expected format
        converter = AudioChunkConverter(args.rate, args.width, args.channels)
        with proc:
            while True:
                event = read_event()
                if event is None:
                    break

                if AudioChunk.is_type(event.type):
                    chunk = AudioChunk.from_event(event)
                    chunk = converter.convert(chunk)
                    proc.stdin.write(chunk.audio)
                    proc.stdin.flush()
                elif AudioStop.is_type(event.type):
                    break
    finally:
        # Always report playback completion, even after an error
        write_event(Played().event())
async def main() -> None:
    """Play one or more WAV files (or stdin) through the configured snd program."""
    parser = argparse.ArgumentParser()
    parser.add_argument("wav_file", nargs="*", help="Path to WAV file(s) to play")
    parser.add_argument(
        "-c",
        "--config",
        default=_DIR.parent / "config",
        help="Configuration directory",
    )
    parser.add_argument("-p", "--pipeline", default="default", help="Name of pipeline")
    parser.add_argument("--snd-program", help="Audio output program name")
    parser.add_argument(
        "--samples-per-chunk",
        type=int,
        default=DEFAULT_SAMPLES_PER_CHUNK,
        help="Samples to send to snd program at a time",
    )
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to console"
    )
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    rhasspy = Rhasspy.load(args.config)

    # Explicit --snd-program wins; otherwise fall back to the pipeline's program
    snd_program = args.snd_program
    if not snd_program:
        pipeline = rhasspy.config.pipelines.get(args.pipeline)
        assert pipeline is not None, f"No pipeline named {args.pipeline}"
        snd_program = pipeline.snd

    assert snd_program, "No snd program"

    if not args.wav_file:
        # No file arguments: play a single WAV read from stdin
        if os.isatty(sys.stdin.fileno()):
            print("Reading WAV data from stdin", file=sys.stderr)

        await play(
            rhasspy,
            snd_program,
            sys.stdin.buffer,
            args.samples_per_chunk,
        )
        return

    for wav_path in args.wav_file:
        with open(wav_path, "rb") as wav_file:
            await play(rhasspy, snd_program, wav_file, args.samples_per_chunk)
def main():
    """Adapt an HTTP TTS endpoint: synthesize events in, WAV audio events out.

    For each ``synthesize`` event, requests WAV audio from the endpoint (text
    passed as a query parameter) and streams it back as audio-start /
    audio-chunk / audio-stop events.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "url",
        help="URL of API endpoint",
    )
    parser.add_argument(
        "--param",
        nargs=2,
        action="append",
        metavar=("name", "value"),
        help="Name/value of query parameter",
    )
    #
    parser.add_argument(
        "--samples-per-chunk", type=int, default=DEFAULT_SAMPLES_PER_CHUNK
    )
    #
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to console"
    )
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    params = {}
    if args.param:
        # BUG FIX: previously iterated over the (still empty) params dict
        # instead of args.param, so every --param value was silently dropped.
        for key, value in args.param:
            # Don't include empty parameters
            if value:
                params[key] = value

    try:
        while True:
            event = read_event()
            if event is None:
                break

            if Synthesize.is_type(event.type):
                synthesize = Synthesize.from_event(event)
                params["text"] = synthesize.text
                url = args.url + "?" + urlencode(params)
                with urlopen(url) as response:
                    with wave.open(response, "rb") as wav_file:
                        rate = wav_file.getframerate()
                        width = wav_file.getsampwidth()
                        channels = wav_file.getnchannels()
                        num_frames = wav_file.getnframes()
                        audio_bytes = wav_file.readframes(num_frames)

                        # NOTE(review): bytes_per_chunk ignores the channel
                        # count, so multi-channel chunks hold fewer samples
                        # than requested — harmless for playback; confirm.
                        bytes_per_chunk = args.samples_per_chunk * width
                        timestamp = 0
                        write_event(
                            AudioStart(
                                rate, width, channels, timestamp=timestamp
                            ).event()
                        )
                        while audio_bytes:
                            chunk = AudioChunk(
                                rate,
                                width,
                                channels,
                                audio_bytes[:bytes_per_chunk],
                                timestamp=timestamp,
                            )
                            write_event(chunk.event())
                            timestamp += chunk.milliseconds
                            audio_bytes = audio_bytes[bytes_per_chunk:]

                        write_event(AudioStop(timestamp=timestamp).event())
    except KeyboardInterrupt:
        pass
def main():
    """Adapt a text-to-WAV command: synthesize events in, audio events out."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "command",
        help="Command to run",
    )
    parser.add_argument(
        "--temp_file",
        action="store_true",
        help="Command has {temp_file} and will write output to it",
    )
    parser.add_argument(
        "--text",
        action="store_true",
        help="Command has {text} argument",
    )
    #
    parser.add_argument(
        "--samples-per-chunk", type=int, default=DEFAULT_SAMPLES_PER_CHUNK
    )
    #
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to console"
    )
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    try:
        while True:
            event = read_event()
            if event is None:
                break

            if not Synthesize.is_type(event.type):
                continue

            request = Synthesize.from_event(event)
            wav_bytes = text_to_wav(args, request.text)

            with io.BytesIO(wav_bytes) as wav_buffer:
                with wave.open(wav_buffer, "rb") as reader:
                    rate = reader.getframerate()
                    width = reader.getsampwidth()
                    channels = reader.getnchannels()
                    frames = reader.readframes(reader.getnframes())

                    chunk_size = args.samples_per_chunk * width
                    timestamp = 0
                    write_event(
                        AudioStart(rate, width, channels, timestamp=timestamp).event()
                    )
                    while frames:
                        chunk = AudioChunk(
                            rate,
                            width,
                            channels,
                            frames[:chunk_size],
                            timestamp=timestamp,
                        )
                        write_event(chunk.event())
                        timestamp += chunk.milliseconds
                        frames = frames[chunk_size:]

                    write_event(AudioStop(timestamp=timestamp).event())
    except KeyboardInterrupt:
        pass


def text_to_wav(args: argparse.Namespace, text: str) -> bytes:
    """Run the configured command for *text* and return its WAV output bytes."""
    command_str = args.command
    format_args = {}

    if args.text:
        format_args["text"] = text
        text = ""  # Pass as arg instead

    if args.temp_file:
        # Command writes WAV audio into {temp_file} instead of stdout
        with tempfile.NamedTemporaryFile(mode="wb", suffix=".wav") as wav_file:
            format_args["temp_file"] = wav_file.name
            command = shlex.split(command_str.format(**format_args))

            # Send stdout to devnull so it doesn't interfere with our events
            subprocess.run(
                command, check=True, stdout=subprocess.DEVNULL, input=text.encode()
            )
            wav_file.seek(0)
            return Path(wav_file.name).read_bytes()

    # Command writes WAV audio directly to stdout
    command = shlex.split(command_str.format(**format_args))
    return subprocess.check_output(command, input=text.encode())
args.snd_program pipeline = rhasspy.config.pipelines.get(args.pipeline) if not tts_program: assert pipeline is not None, f"No pipeline named {args.pipeline}" tts_program = pipeline.tts if not snd_program: assert pipeline is not None, f"No pipeline named {args.pipeline}" snd_program = pipeline.snd assert tts_program, "No tts program" assert snd_program, "No snd program" if args.text: lines = args.text else: lines = sys.stdin if os.isatty(sys.stdin.fileno()): print("Reading text from stdin", file=sys.stderr) for line in lines: line = line.strip() if not line: continue with io.BytesIO() as wav_io: await synthesize(rhasspy, tts_program, line, wav_io) wav_io.seek(0) play_result = await play( rhasspy, snd_program, wav_io, args.samples_per_chunk ) if play_result is not None: json.dump(play_result.event().to_dict(), sys.stdout, ensure_ascii=False) print("", flush=True) if __name__ == "__main__": asyncio.run(main()) ================================================ FILE: bin/tts_synthesize.py ================================================ #!/usr/bin/env python3 """Synthesize WAV audio from text.""" import argparse import asyncio import io import logging import sys from pathlib import Path from rhasspy3.core import Rhasspy from rhasspy3.tts import synthesize _FILE = Path(__file__) _DIR = _FILE.parent _LOGGER = logging.getLogger(_FILE.stem) async def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("text", help="Text to speak") parser.add_argument( "-c", "--config", default=_DIR.parent / "config", help="Configuration directory", ) parser.add_argument("-p", "--pipeline", default="default", help="Name of pipeline") parser.add_argument("--tts-program", help="TTS program name") parser.add_argument("-f", "--file", help="Write to file instead of stdout") parser.add_argument( "--debug", action="store_true", help="Print DEBUG messages to console" ) args = parser.parse_args() logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) rhasspy = 
def main() -> None:
    """Adapter for VAD commands that read raw PCM and print one speech
    probability (a float line on stdout) per fixed-size chunk.

    Reads audio-chunk events from stdin, re-chunks the audio to exactly
    ``--samples-per-chunk`` samples, feeds each chunk to the subprocess, and
    uses a Segmenter over the returned probabilities to emit a single
    voice-started / voice-stopped event pair.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "command",
        help="Command to run",
    )
    # Audio format of the incoming stream (chunks are converted to match)
    parser.add_argument(
        "--rate",
        type=int,
        required=True,
        help="Sample rate (hz)",
    )
    parser.add_argument(
        "--width",
        type=int,
        required=True,
        help="Sample width bytes",
    )
    parser.add_argument(
        "--channels",
        type=int,
        required=True,
        help="Sample channel count",
    )
    parser.add_argument(
        "--samples-per-chunk",
        required=True,
        type=int,
        help="Samples to send to command at a time",
    )
    # Segmenter tuning (values presumably in seconds — TODO confirm against
    # rhasspy3.vad.Segmenter)
    parser.add_argument(
        "--threshold",
        type=float,
        default=0.5,
        help="Speech probability threshold (0-1)",
    )
    parser.add_argument(
        "--speech-seconds",
        type=float,
        default=0.3,
    )
    parser.add_argument(
        "--silence-seconds",
        type=float,
        default=0.5,
    )
    parser.add_argument(
        "--timeout-seconds",
        type=float,
        default=15.0,
    )
    parser.add_argument(
        "--reset-seconds",
        type=float,
        default=1,
    )
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to console"
    )
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    # Fixed chunk size the VAD command expects, in bytes and in seconds
    bytes_per_chunk = args.samples_per_chunk * args.width * args.channels
    seconds_per_chunk = args.samples_per_chunk / args.rate

    command = shlex.split(args.command)
    with subprocess.Popen(
        command, stdin=subprocess.PIPE, stdout=subprocess.PIPE
    ) as proc:
        assert proc.stdin is not None
        assert proc.stdout is not None

        segmenter = Segmenter(
            args.speech_seconds,
            args.silence_seconds,
            args.timeout_seconds,
            args.reset_seconds,
        )
        converter = AudioChunkConverter(args.rate, args.width, args.channels)

        # Buffer of converted-but-not-yet-processed PCM bytes
        audio_bytes = bytes()
        is_first_audio = True
        # Guards so voice-started/voice-stopped are emitted at most once
        sent_started = False
        sent_stopped = False
        # End timestamp of the most recent chunk; used as the stop timestamp
        # if the stream ends before the segmenter reports a stop
        last_stop_timestamp: Optional[int] = None

        while True:
            event = read_event()
            if event is None:
                break

            if AudioChunk.is_type(event.type):
                if is_first_audio:
                    _LOGGER.debug("Receiving audio")
                    is_first_audio = False

                chunk = AudioChunk.from_event(event)
                chunk = converter.convert(chunk)
                audio_bytes += chunk.audio

                # Prefer the sender's timestamp; fall back to a local clock
                timestamp = (
                    time.monotonic_ns() if chunk.timestamp is None else chunk.timestamp
                )
                last_stop_timestamp = timestamp + chunk.milliseconds

                # Handle uneven chunk sizes
                while len(audio_bytes) >= bytes_per_chunk:
                    chunk_bytes = audio_bytes[:bytes_per_chunk]
                    proc.stdin.write(chunk_bytes)
                    proc.stdin.flush()

                    # One probability line expected per chunk written
                    line = proc.stdout.readline().decode()
                    if line:
                        speech_probability = float(line)
                        is_speech = speech_probability > args.threshold
                        segmenter.process(
                            chunk=chunk_bytes,
                            chunk_seconds=seconds_per_chunk,
                            is_speech=is_speech,
                            timestamp=timestamp,
                        )
                        if (not sent_started) and segmenter.started:
                            _LOGGER.debug("Voice started")
                            write_event(
                                VoiceStarted(
                                    timestamp=segmenter.start_timestamp
                                ).event()
                            )
                            sent_started = True

                        if (not sent_stopped) and segmenter.stopped:
                            if segmenter.timeout:
                                _LOGGER.info("Voice timeout")
                            else:
                                _LOGGER.debug("Voice stopped")

                            write_event(
                                VoiceStopped(timestamp=segmenter.stop_timestamp).event()
                            )
                            sent_stopped = True

                    audio_bytes = audio_bytes[bytes_per_chunk:]
            elif AudioStop.is_type(event.type):
                _LOGGER.debug("Audio stopped")
                # Stream ended: guarantee a voice-stopped event is emitted
                if not sent_stopped:
                    write_event(VoiceStopped(timestamp=last_stop_timestamp).event())
                    sent_stopped = True

                proc.stdin.close()
                break
def get_wav_bytes(args: argparse.Namespace) -> Iterable[bytes]:
    """Yields WAV audio from stdin or args."""
    if not args.wav:
        # WAV on stdin
        yield sys.stdin.buffer.read()
        return

    # WAV file path(s)
    for wav_path in args.wav:
        yield Path(wav_path).read_bytes()
async def main() -> None:
    """Run the mic program and print wake word detections as JSON lines."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-c",
        "--config",
        default=_DIR.parent / "config",
        help="Configuration directory",
    )
    parser.add_argument(
        "-p", "--pipeline", default="default", help="Name of pipeline to use"
    )
    parser.add_argument(
        "--mic-program", help="Name of mic program to use (overrides pipeline)"
    )
    parser.add_argument(
        "--wake-program", help="Name of wake program to use (overrides pipeline)"
    )
    #
    parser.add_argument("--loop", action="store_true", help="Keep detecting wake words")
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to console"
    )
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    rhasspy = Rhasspy.load(args.config)
    pipeline = rhasspy.config.pipelines.get(args.pipeline)

    # Resolve mic/wake programs: explicit flags win over the pipeline config
    mic_program = args.mic_program
    if not mic_program:
        assert pipeline is not None, f"No pipeline named {args.pipeline}"
        mic_program = pipeline.mic

    assert mic_program, "No mic program"
    _LOGGER.debug("mic program: %s", mic_program)

    wake_program = args.wake_program
    if not wake_program:
        assert pipeline is not None, f"No pipeline named {args.pipeline}"
        wake_program = pipeline.wake

    assert wake_program, "No wake program"
    _LOGGER.debug("wake program: %s", wake_program)

    # Detect wake word (repeatedly with --loop)
    while True:
        mic_proc = await create_process(rhasspy, MIC_DOMAIN, mic_program)
        async with mic_proc:
            assert mic_proc.stdout is not None

            _LOGGER.debug("Detecting wake word")
            detection = await detect(rhasspy, wake_program, mic_proc.stdout)
            if detection is not None:
                json.dump(detection.event().to_dict(), sys.stdout, ensure_ascii=False)
                print("", flush=True)

        if not args.loop:
            break
## wake Receives `audio-chunk` events. Emits `detection` event(s) or a `not-detected` event if the program exits without a detection. ## asr Receives an `audio-start` event, followed by zero or more `audio-chunk` events. An `audio-stop` event must trigger a `transcript` event to be emitted. ## vad Receives `audio-chunk` events. Emits `voice-started` with the `timestamp` of the `audio-chunk` when the user started speaking. Emits `voice-stopped` with the `timestamp` of the `audio-chunk` when the user finished speaking. ## intent Optional. The `handle` domain can handle `transcript` events directly. Receives `recognize` events. Emits either an `intent` or a `not-recognized` event. ## handle Receives one of the following event types: `transcript`, `intent`, or `not-recognized`. Emits either a `handle` or `not-handled` event. ## tts Receives a `synthesize` event. Emits an `audio-start` event followed by zero or more `audio-chunk` events, and then an `audio-stop` event. ## snd Receives `audio-chunk` events until an `audio-stop` event. Must emit `played` event when audio has finished playing. ================================================ FILE: docs/home_assistant.md ================================================ # Home Assistant This will connect Rhasspy to Home Assistant via [Assist](https://www.home-assistant.io/docs/assist). 
Install the Home Assistant intent handler: ```sh mkdir -p config/programs/handle/ cp -R programs/handle/home_assistant config/programs/handle/ ``` Create a long-lived access token in Home Assistant (inside your profile): ![Long-lived access token](img/ha_token.png) Copy the **entire** access token (with CTRL+A, not just selecting what you can see) and put it in the data directory: ```sh mkdir -p config/data/handle/home_assistant/ echo "MY-LONG-LIVED-ACCESS-TOKEN" > config/data/handle/home_assistant/token ``` Add to your `configuration.yaml`: ```yaml programs: handle: home_assistant: command: | bin/converse.py --language "${language}" "${url}" "${token_file}" adapter: | handle_adapter_text.py template_args: url: "http://localhost:8123/api/conversation/process" token_file: "${data_dir}/token" language: "en" pipelines: default: mic: ... vad: ... asr: ... wake: ... handle: name: home_assistant tts: ... snd: ... ``` Make sure your Home Assistant server is running, and test out a command: ```sh script/run bin/handle_text.py "Turn on the bed light" ``` Replace "bed light" with the name of a device you have connected to Home Assistant. If successful, you should see JSON printed with the response text, like: ```sh {"type": "handled", "data": {"text": "Turned on light"}} ``` This also works over HTTP: ```sh curl -X POST --data 'Turn on the bed light' 'localhost:13331/handle/handle' ``` Now you can run your full pipeline and control Home Assistant! ================================================ FILE: docs/satellite.md ================================================ # Satellite Once you have a Rhasspy HTTP server running, you can use Rhasspy as a satellite on a separate device. **NOTE:** Rhasspy satellites do not need to run Python or any Rhasspy software. They can use the websocket API directly, or talk directly to a running pipeline. 
On your satellite, clone the repo: ```sh git clone https://github.com/rhasspy/rhasspy3 cd rhasspy3 ``` Install the websocket utility: ```sh mkdir -p config/programs/remote/ cp -R programs/remote/websocket config/programs/remote/ config/programs/remote/websocket/script/setup ``` Install [Porcupine](https://github.com/Picovoice/porcupine): ```sh mkdir -p config/programs/wake/ cp -R programs/wake/porcupine1 config/programs/wake/ config/programs/wake/porcupine1/script/setup ``` Check available wake word models by running ```sh config/programs/wake/porcupine1/script/list_models ``` and choose one. We'll use "porcupine_linux.ppn" as an example, but this will be **different on a Raspberry Pi**. Next, create `config/configuration.yaml` with: ```yaml programs: mic: arecord: command: | arecord -q -r 16000 -c 1 -f S16_LE -t raw - adapter: | mic_adapter_raw.py --samples-per-chunk 1024 --rate 16000 --width 2 --channels 1 wake: porcupine1: command: | .venv/bin/python3 bin/porcupine_stream.py --model "${model}" template_args: model: "porcupine_linux.ppn" remote: websocket: command: | script/run "${uri}" template_args: uri: "ws://localhost:13331/pipeline/asr-tts" satellites: default: mic: name: arecord wake: name: porcupine1 remote: name: websocket snd: name: aplay ``` Replace the model in `porcupine1` with your selection, and adjust the URI in `websocket` to point to your Rhasspy server. Now you can run your satellite: ```sh script/run bin/satellite_run.py --debug --loop ``` (say "porcupine", *pause*, say voice command, *wait*) If everything is working, you should hear a response being spoken. Press CTRL+C to quit. ================================================ FILE: docs/tutorial.md ================================================ # Tutorial Welcome to Rhasspy 3! This is a developer preview, so many of the manual steps here will be replaced with something more user-friendly in the future. ## Installing Rhasspy 3 To get started, just clone the repo. 
Rhasspy's core does not currently have any dependencies outside the Python standard library. ```sh git clone https://github.com/rhasspy/rhasspy3 cd rhasspy3 ``` ## Layout Installed programs and downloaded models are stored in the `config` directory, which is empty by default: * `rhasspy3/config/` * `configuration.yaml` - overrides `rhasspy3/configuration.yaml` * `programs/` - installed programs * `/` * `/` * `data/` - downloaded models * `/` * `/` Programs in Rhasspy are divided into [domains](domains.md). ## Configuration Rhasspy loads two configuration files: 1. `rhasspy3/configuration.yaml` (base) 2. `config/configuration.yaml` (user) The file in `config` will override the base configuration. You can see what the final configuration looks like with: ```sh script/run bin/config_print.py ``` ## Microphone Programs that were not designed for Rhasspy can be used with [adapters](adapters.md). For example, add the following to your `configuration.yaml` (in the `config` directory): ```yaml programs: mic: arecord: command: | arecord -q -r 16000 -c 1 -f S16_LE -t raw - adapter: | mic_adapter_raw.py --rate 16000 --width 2 --channels 1 pipelines: default: mic: name: arecord ``` Now you can run a microphone test: ```sh script/run bin/mic_test_energy.py ``` When speaking, you should see the bar change with volume. If not, check the available devices with `arecord -L` and update the `arecord` command in `configuration.yaml` with `-D ` (prefer devices that start with `plughw:`). Press CTRL+C to quit. Pipelines will be discussed later. For now, know that the pipeline named `default` will be run if you don't specify one. The mic test script can do this: ```sh script/run bin/mic_test_energy.py --pipeline my-pipeline ``` You can also override the mic program: ```sh script/run bin/mic_test_energy.py --mic-program other-program-from-config ``` ## Voice Activity Detection Let's install our first program, [Silero VAD](https://github.com/snakers4/silero-vad/). 
Start by copying from `programs/` to `config/programs`, then run the setup script: ```sh mkdir -p config/programs/vad/ cp -R programs/vad/silero config/programs/vad/ config/programs/vad/silero/script/setup ``` Once the setup script completes, add the following to your `configuration.yaml`: ```yaml programs: mic: ... vad: silero: command: | script/speech_prob "share/silero_vad.onnx" adapter: | vad_adapter_raw.py --rate 16000 --width 2 --channels 1 --samples-per-chunk 512 pipelines: default: mic: ... vad: name: silero ``` This calls a command inside `config/programs/vad/silero` and uses an adapter. Notice that the command's working directory will always be `config/programs//`. You can test out the voice activity detection (VAD) by recording an audio sample: ```sh script/run bin/mic_record_sample.py sample.wav ``` Say something for a few seconds and then wait for the program to finish. Afterwards, listen to `sample.wav` and verify that it sounds correct. You may need to adjust microphone settings with `alsamixer` ## Speech to Text Now for the fun part! We'll be installing [faster-whisper](https://github.com/guillaumekln/faster-whisper/), an optimized version of Open AI's [Whisper](https://github.com/openai/whisper) model. ```sh mkdir -p config/programs/asr/ cp -R programs/asr/faster-whisper config/programs/asr/ config/programs/asr/faster-whisper/script/setup ``` Before using faster-whisper, we need to download a model: ```sh config/programs/asr/faster-whisper/script/download.py tiny-int8 ``` Notice that the model was downloaded to `config/data/asr/faster-whisper`: ```sh find config/data/asr/faster-whisper/ config/data/asr/faster-whisper/ config/data/asr/faster-whisper/tiny-int8 config/data/asr/faster-whisper/tiny-int8/vocabulary.txt config/data/asr/faster-whisper/tiny-int8/model.bin config/data/asr/faster-whisper/tiny-int8/config.json ``` The `tiny-int8` model is the smallest and fastest model, but may not give the best transcriptions. 
Run `download.py` without any arguments to see the available models, or follow [the instructions](https://github.com/guillaumekln/faster-whisper/#model-conversion) to make your own! Add the following to `configuration.yaml`: ```yaml programs: mic: ... vad: ... asr: faster-whisper: command: | script/wav2text "${data_dir}/tiny-int8" "{wav_file}" adapter: | asr_adapter_wav2text.py pipelines: default: mic: ... vad: ... asr: name: faster-whisper ``` You can now transcribe a voice command: ```sh script/run bin/asr_transcribe.py ``` (say something) You should see a transcription of what you said as part of an [event](wyoming.md). ### Client/Server Speech to text systems can take a while to load their models, so a lot of time is wasted if we start from scratch each time. Some speech to text and text to speech programs have included servers. These usually use [Unix domain sockets](https://en.wikipedia.org/wiki/Unix_domain_socket) to communicate with a small client program. Add the following to your `configuration.yaml`: ```yaml programs: mic: ... vad: ... asr: faster-whisper: ... faster-whisper.client: command: | client_unix_socket.py var/run/faster-whisper.socket servers: asr: faster-whisper: command: | script/server --language "en" "${data_dir}/tiny-int8" pipelines: default: mic: ... vad: ... asr: name: faster-whisper.client ``` Start the server in a separate terminal: ```sh script/run bin/server_run.py asr faster-whisper ``` When it prints "Ready", transcribe yourself speaking again: ```sh script/run bin/asr_transcribe.py ``` (say something) You should receive your transcription a bit faster than before. ### HTTP Server Rhasspy includes a small HTTP server that allows you to access programs and pipelines over a web API. 
To get started, run the setup script: ```sh script/setup_http_server ``` Run HTTP server in a separate terminal: ```sh script/http_server --debug ``` Now you can transcribe a WAV file over HTTP: ```sh curl -X POST -H 'Content-Type: audio/wav' --data-binary @etc/what_time_is_it.wav 'localhost:13331/asr/transcribe' ``` You can run one or more program servers along with the HTTP server: ```sh script/http_server --debug --server asr faster-whisper ``` **NOTE:** You will need to restart the HTTP server when you change `configuration.yaml` ## Wake Word Detection Next, we'll install [Porcupine](https://github.com/Picovoice/porcupine): ```sh mkdir -p config/programs/wake/ cp -R programs/wake/porcupine1 config/programs/wake/ config/programs/wake/porcupine1/script/setup ``` Check available wake word models with: ```sh config/programs/wake/porcupine1/script/list_models alexa_linux.ppn americano_linux.ppn blueberry_linux.ppn bumblebee_linux.ppn computer_linux.ppn grapefruit_linux.ppn grasshopper_linux.ppn hey google_linux.ppn hey siri_linux.ppn jarvis_linux.ppn ok google_linux.ppn pico clock_linux.ppn picovoice_linux.ppn porcupine_linux.ppn smart mirror_linux.ppn snowboy_linux.ppn terminator_linux.ppn view glass_linux.ppn ``` **NOTE:** These will be slightly different on a Raspberry Pi (`_raspberry-pi.ppn` instead of `_linux.ppn`). Add to `configuration.yaml`: ```yaml programs: mic: ... vad: ... asr: ... wake: porcupine1: command: | .venv/bin/python3 bin/porcupine_stream.py --model "${model}" template_args: model: "porcupine_linux.ppn" servers: asr: ... pipelines: default: mic: ... vad: ... asr: ... wake: name: porcupine1 ``` Notice that we include `template_args` in the `programs` section. This lets us change specific settings in `pipelines`, which will be demonstrated in a moment. Test wake word detection: ```sh script/run bin/wake_detect.py --debug ``` (say "porcupine") Now change the model in `configuration.yaml`: ```yaml programs: mic: ... vad: ... asr: ... wake: ... 
servers: asr: ... pipelines: default: mic: ... vad: ... asr: ... wake: name: porcupine1 template_args: model: "grasshopper_linux.ppn" ``` Test wake word detection again: ```sh script/run bin/wake_detect.py --debug ``` (say "grasshopper") For non-English models, first download the extra data files: ```sh config/programs/wake/porcupine1/script/download.py ``` Next, adjust your `configuration.yaml`. For example, this uses the German keyword "ananas": ```yaml programs: wake: porcupine1: command: | .venv/bin/python3 bin/porcupine_stream.py --model "${model}" --lang_model "${lang_model}" template_args: model: "${data_dir}/resources/keyword_files_de/linux/ananas_linux.ppn" lang_model: "${data_dir}/lib/common/porcupine_params_de.pv" ``` Inspect the files in `config/data/wake/porcupine1` for supported languages and keywords. At this time, English, German (de), French (fr), and Spanish (es) are available with keywords for `linux`, `raspberry-pi`, and many other platforms. Going back to "grasshopper", we can test over HTTP server (restart server): ```sh curl -X POST 'localhost:13331/pipeline/run?stop_after=wake' ``` (say "grasshopper") Test full voice command: ```sh curl -X POST 'localhost:13331/pipeline/run?stop_after=asr' ``` (say "grasshopper", *pause*, voice command, *wait*) ## Intent Handling There are two types of intent handlers in Rhasspy, ones that handle transcripts directly (text) and others that handle structured intents (name + entities). For this example, we will be handling text directly from `asr`. In `configuration.yaml`: ```yaml programs: mic: ... vad: ... asr: ... wake: ... handle: date_time: command: | bin/date_time.py adapter: | handle_adapter_text.py servers: asr: ... pipelines: default: mic: ... vad: ... asr: ... wake: ... 
handle: name: date_time ``` Install date time demo script: ```sh mkdir -p config/programs/handle/ cp -R programs/handle/date_time config/programs/handle/ ``` This script just looks for the words "date" and "time" in the text, and responds appropriately. You can test it on some text: ```sh echo 'What time is it?' | script/run bin/handle_text.py --debug ``` Now let's test it with a full voice command: ```sh script/run bin/pipeline_run.py --debug --stop-after handle ``` (say "grasshopper", *pause*, "what time is it?") It works too over HTTP (restart server): ```sh curl -X POST 'localhost:13331/pipeline/run?stop_after=handle' ``` (say "grasshopper", *pause*, "what's the date?") ## Text to Speech and Sound The final stages of our pipeline will be text to speech (`tts`) and audio output (`snd`). Install [Piper](https://github.com/rhasspy/piper): ```sh mkdir -p config/programs/tts/ cp -R programs/tts/piper config/programs/tts/ config/programs/tts/piper/script/setup.py ``` and download an English voice: ```sh config/programs/tts/piper/script/download.py english ``` Call `download.py` without any arguments to see available voices. Add to `configuration.yaml`: ```yaml programs: mic: ... vad: ... asr: ... wake: ... handle: ... tts: piper: command: | bin/piper --model "${model}" --output_file - adapter: | tts_adapter_text2wav.py template_args: model: "${data_dir}/en-us-blizzard_lessac-medium.onnx" snd: aplay: command: | aplay -q -r 22050 -f S16_LE -c 1 -t raw adapter: | snd_adapter_raw.py --rate 22050 --width 2 --channels 1 servers: asr: ... pipelines: default: mic: ... vad: ... asr: ... wake: ... handle: ... tts: name: piper snd: name: aplay ``` We can test the text to speech and audio output programs: ```sh script/run bin/tts_speak.py 'Welcome to the world of speech synthesis.' ``` The `bin/tts_synthesize.py` can be used if you want to just output a WAV file. ```sh script/run bin/tts_synthesize.py 'Welcome to the world of speech synthesis.' 
> welcome.wav ``` This also works over HTTP (restart server): ```sh curl -X POST \ --data 'Welcome to the world of speech synthesis.' \ --output welcome.wav \ 'localhost:13331/tts/synthesize' ``` Or to speak over HTTP: ```sh curl -X POST --data 'Welcome to the world of speech synthesis.' 'localhost:13331/tts/speak' ``` ### Client/Server Like speech to text, text to speech models can take a while to load. Let's add a server for Piper to `configuration.yaml`: ```yaml programs: mic: ... vad: ... asr: ... wake: ... handle: ... tts: piper.client: command: | client_unix_socket.py var/run/piper.socket snd: ... servers: asr: ... tts: piper: command: | script/server "${model}" template_args: model: "${data_dir}/en-us-blizzard_lessac-medium.onnx" pipelines: default: mic: ... vad: ... asr: ... wake: ... handle: ... tts: name: piper.client snd: ... ``` Now we can run both servers with the HTTP server: ```sh script/http_server --debug --server asr faster-whisper --server tts piper ``` Text to speech requests should be faster now. ## Complete Pipeline As a final example, let's run a complete pipeline from wake word detection to text to speech response: ```sh script/run bin/pipeline_run.py --debug ``` (say "grasshopper", *pause*, "what time is it?", *wait*) Rhasspy should speak the current time. This also works over HTTP: ```sh curl -X POST 'localhost:13331/pipeline/run' ``` (say "grasshopper", *pause*, "what is the date?", *wait*) Rhasspy should speak the current date. ## Next Steps * Connect Rhasspy to [Home Assistant](home_assistant.md) * Run one or more [satellites](satellite.md) ================================================ FILE: docs/wyoming.md ================================================ # The Wyoming Protocol An interprocess event protocol over stdin/stdout for Rhasspy v3. 
(effectively [JSONL](https://jsonlines.org/) with an optional binary payload) ![Wyoming protocol](img/wyoming.png) ## Motivation Rhasspy v2 was built on top of MQTT, and therefore required (1) an MQTT broker and (2) all services to talk over MQTT. Each open source voice program needed a custom service wrapper to talk to Rhasspy. For v3, a project goal was to minimize the barrier for programs to talk to Rhasspy. ## Talking Directly to Programs Many voice programs have similar command line interfaces. For example, most text to speech programs accept text through standard input and write a WAV file to standard output or a file: ```sh echo “Input text” | text-to-speech > output.wav ``` A protocol based on standard input/output would be universal across languages, operating systems, etc. However, some voice programs need to consume or produce audio/event streams. For example, a speech to text system may return a result much quicker if it can process audio as it's being recorded. ## Event Streams Standard input/output are byte streams, but they can be easily adapted to event streams that can also carry binary data. This lets us send, for example, chunks of audio to a speech to text program as well as an event to say the stream is finished. All without a broker or a socket! Each **event** in the Wyoming protocol is: 1. A **single line** of JSON with an object: * **MUST** have a `type` field with an event type name * MAY have a `data` field with an object that contains event-specific data * MAY have a `payload_length` field with a number > 0 2. If `payload_length` is given, *exactly* that may bytes follows Example: ```json { "type": "audio-chunk", "data": { "rate": 16000, "width", "channels": 1 }, "payload_length": 2048 } <2048 bytes> ``` ## Adapter Using events over standard input/output unfortunately means we cannot talk to most programs directly. Fortunately, [small adapters](adapters.md) can be written and shared for programs with similar command-line interfaces. 
The adapter speaks events to Rhasspy, but calls the underlying program according to a common convention like “text in, WAV out”. ![Wyoming protocol adapter](img/adapter.png) ## Events Types Voice programs vary significantly in their options, but programs within the same [domain](domains.md) have the same minimal requirements to function: * mic * Audio input * Outputs fixed-sized chunks of PCM audio from a microphone, socket, etc. * Audio chunks may contain timestamps * wake * Wake word detection * Inputs fixed-sized chunks of PCM audio * Outputs name of detected model, timestamp of audio chunk * asr * Speech to text * Inputs fixed-sized chunks of PCM audio * Inputs an event indicating the end of the audio stream (or voice command) * Outputs a transcription * vad * Voice activity detection * Inputs fixed-sized chunks of PCM audio * Outputs events indicating the beginning and end of a voice command * intent * Intent recognition * Inputs text * Outputs an intent with a name and entities (slots) * handle * Intent/text handling * Does something with an intent or directly with a transcription * Outputs a text response * tts * Text to speech * Inputs text * Outputs one or more fixed-sized chunks of PCM audio * snd * Audio output * Inputs fixed-sized chunks of PCM audio * Plays audio through a sound system The following event types are currently defined: | Domain | Type | Data | Payload | |--------|----------------|----------------------------------|---------| | audio | audio-start | timestamp, rate, width, channels | | | audio | audio-chunk | timestamp, rate, width, channels | PCM | | audio | audio-stop | timestamp | | | wake | detection | name, timestamp | | | wake | not-detected | | | | vad | voice-started | timestamp | | | vad | voice-stopped | timestamp | | | asr | transcript | text | | | intent | recognize | text | | | intent | intent | name, entities | | | intent | not-recognized | text | | | handle | handled | text | | | handle | not-handled | text | | | tts | 
synthesize | text | | | snd | played | | | ================================================ FILE: examples/satellite/configuration.yaml ================================================ satellites: default: mic: name: arecord template_args: device: "default" wake: name: porcupine1 template_args: model: "porcupine_raspberry-pi.ppn" remote: name: websocket template_args: uri: "ws://homeassistant.local:13331/pipeline/asr-tts" snd: name: aplay template_args: device: "default" ================================================ FILE: mypy.ini ================================================ [mypy] ignore_missing_imports = true [mypy-setuptools.*] ignore_missing_imports = True ================================================ FILE: programs/asr/coqui-stt/README.md ================================================ # Coqui STT Speech to text service for Rhasspy based on [Coqui STT](https://stt.readthedocs.io/en/latest/). Additional models can be downloaded here: https://coqui.ai/models/ ## Installation 1. Copy the contents of this directory to `config/programs/asr/coqui-stt/` 2. Run `script/setup` 3. Download a model with `script/download.py` * Example: `script/download.py en_large` * Models are downloaded to `config/data/asr/coqui-stt` directory 4. 
def main() -> None:
    """Transcribe raw 16-bit mono PCM audio from stdin with Coqui STT.

    Reads the audio stream in fixed-size chunks, feeds each chunk to a
    streaming Coqui STT model, and prints the final transcript to stdout.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("model", help="Path to Coqui STT model directory")
    parser.add_argument(
        "--scorer", help="Path to scorer (default: .scorer file in model directory)"
    )
    parser.add_argument(
        "--alpha-beta",
        type=float,
        nargs=2,
        metavar=("alpha", "beta"),
        help="Scorer alpha/beta",
    )
    parser.add_argument(
        "--samples-per-chunk",
        type=int,
        default=1024,
        help="Number of samples to process at a time",
    )
    parser.add_argument("--debug", action="store_true", help="Log DEBUG messages")
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    model_dir = Path(args.model)

    # Model and scorer files are located by extension inside the model directory
    model_path = next(model_dir.glob("*.tflite"))
    if args.scorer:
        scorer_path = Path(args.scorer)
    else:
        scorer_path = next(model_dir.glob("*.scorer"))

    _LOGGER.debug("Loading model: %s, scorer: %s", model_path, scorer_path)
    model = Model(str(model_path))
    model.enableExternalScorer(str(scorer_path))
    if args.alpha_beta is not None:
        model.setScorerAlphaBeta(*args.alpha_beta)

    model_stream = model.createStream()

    # FIX: --samples-per-chunk is documented in samples, but the previous code
    # passed it directly to read(), which counts bytes. Samples are 16-bit
    # (2 bytes each), so read samples * 2 bytes per chunk.
    bytes_per_chunk = args.samples_per_chunk * 2

    chunk = sys.stdin.buffer.read(bytes_per_chunk)

    _LOGGER.debug("Processing audio")
    while chunk:
        chunk_array = np.frombuffer(chunk, dtype=np.int16)
        model_stream.feedAudioContent(chunk_array)
        chunk = sys.stdin.buffer.read(bytes_per_chunk)

    text = model_stream.finishStream()
    _LOGGER.debug(text)
    print(text.strip())
def main():
    """Run a Unix-socket speech-to-text server backed by Coqui STT.

    The model is loaded once at startup; each client connection is then
    handled on its own daemon thread so a slow client does not block new
    connections.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("model", help="Path to Coqui STT model directory")
    parser.add_argument(
        "--scorer", help="Path to scorer (default: .scorer file in model directory)"
    )
    parser.add_argument(
        "--alpha-beta",
        type=float,
        nargs=2,
        metavar=("alpha", "beta"),
        help="Scorer alpha/beta",
    )
    parser.add_argument(
        "--socketfile", required=True, help="Path to Unix domain socket file"
    )
    parser.add_argument(
        "-r",
        "--rate",
        type=int,
        default=16000,
        help="Input audio sample rate (default: 16000)",
    )
    parser.add_argument("--debug", action="store_true", help="Log DEBUG messages")
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    # Need to unlink socket if it exists (left over from a previous run)
    try:
        os.unlink(args.socketfile)
    except OSError:
        pass

    try:
        # Create socket server
        sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        sock.bind(args.socketfile)
        sock.listen()

        model_dir = Path(args.model)
        model_path = next(model_dir.glob("*.tflite"))

        if args.scorer:
            scorer_path = Path(args.scorer)
        else:
            scorer_path = next(model_dir.glob("*.scorer"))

        _LOGGER.debug("Loading model: %s, scorer: %s", model_path, scorer_path)
        model = Model(str(model_path))
        model.enableExternalScorer(str(scorer_path))
        if args.alpha_beta is not None:
            model.setScorerAlphaBeta(*args.alpha_beta)

        _LOGGER.info("Ready")

        # Listen for connections
        while True:
            try:
                connection, client_address = sock.accept()
                _LOGGER.debug("Connection from %s", client_address)

                # Start new thread for client
                threading.Thread(
                    target=handle_client,
                    args=(connection, model, args.rate),
                    daemon=True,
                ).start()
            except KeyboardInterrupt:
                break
            except Exception:
                _LOGGER.exception("Error communicating with socket client")
    finally:
        os.unlink(args.socketfile)


def handle_client(connection: socket.socket, model: Model, rate: int) -> None:
    """Stream audio events from one client into the model; reply with a transcript.

    Expects newline-delimited JSON events ("audio-chunk" followed by a binary
    payload, then "audio-stop") and writes back a single "transcript" event.
    The rate argument is currently unused (chunks are assumed 16-bit PCM).
    """
    try:
        model_stream = model.createStream()
        is_first_audio = True

        with connection, connection.makefile(mode="rwb") as conn_file:
            while True:
                line = conn_file.readline()
                if not line:
                    # FIX: client disconnected without sending audio-stop.
                    # Previously this fell into json.loads("") and was logged
                    # as an unexpected error; treat it as a clean close.
                    break

                event_info = json.loads(line)
                event_type = event_info["type"]

                if event_type == "audio-chunk":
                    if is_first_audio:
                        _LOGGER.debug("Receiving audio")
                        is_first_audio = False

                    # Binary PCM payload immediately follows the JSON line
                    num_bytes = event_info["payload_length"]
                    chunk = conn_file.read(num_bytes)
                    chunk_array = np.frombuffer(chunk, dtype=np.int16)
                    model_stream.feedAudioContent(chunk_array)
                elif event_type == "audio-stop":
                    _LOGGER.info("Audio stopped")
                    text = model_stream.finishStream()
                    transcript_str = (
                        json.dumps(
                            {"type": "transcript", "data": {"text": text}},
                            ensure_ascii=False,
                        )
                        + "\n"
                    )
                    conn_file.write(transcript_str.encode())
                    break
    except Exception:
        _LOGGER.exception("Unexpected error in client thread")
_DIR = Path(__file__).parent
_LOGGER = logging.getLogger("setup")

# Friendly alias -> full model name on the release page
MODELS = {"en_large": "english_v1.0.0-large-vocab"}


def main() -> None:
    """Download and extract Coqui STT model archive(s).

    Accepts either a short alias ("en_large") or a full model name, and
    streams each .tar.gz release asset directly into the destination
    directory (config/data/asr/coqui-stt by default).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "model",
        nargs="+",
        choices=list(itertools.chain(MODELS.keys(), MODELS.values())),
        help="Coqui STT model(s) to download",
    )
    parser.add_argument(
        "--destination", help="Path to destination directory (default: share)"
    )
    parser.add_argument(
        "--link-format",
        default="https://github.com/rhasspy/models/releases/download/v1.0/asr_coqui-stt-{model}.tar.gz",
        help="Format string for download URLs",
    )
    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)

    if args.destination:
        destination = Path(args.destination)
    else:
        # Assume we're in programs/asr/coqui-stt/script
        destination = _DIR.parent.parent.parent.parent / "data" / "asr" / "coqui-stt"

    destination.parent.mkdir(parents=True, exist_ok=True)

    for requested in args.model:
        # Map a short alias to its full name; full names pass through as-is
        model_id = MODELS.get(requested, requested)
        url = args.link_format.format(model=model_id)
        _LOGGER.info("Downloading %s", url)

        # "r|*" reads the archive as a non-seekable compressed stream, so it
        # can be extracted straight from the HTTP response.
        # NOTE(review): extractall() trusts archive member paths; archives
        # come from the project's own release page.
        with urlopen(url) as response, tarfile.open(
            mode="r|*", fileobj=response
        ) as archive:
            _LOGGER.info("Extracting to %s", destination)
            archive.extractall(destination)


if __name__ == "__main__":
    main()
"${socket_dir}/coqui-stt.socket" "$@" ================================================ FILE: programs/asr/coqui-stt/script/setup ================================================ #!/usr/bin/env bash set -eo pipefail # Directory of *this* script this_dir="$( cd "$( dirname "$0" )" && pwd )" # Base directory of repo base_dir="$(realpath "${this_dir}/..")" # Path to virtual environment : "${venv:=${base_dir}/.venv}" # Python binary to use : "${PYTHON=python3}" python_version="$(${PYTHON} --version)" if [ ! -d "${venv}" ]; then # Create virtual environment echo "Creating virtual environment at ${venv} (${python_version})" rm -rf "${venv}" "${PYTHON}" -m venv "${venv}" source "${venv}/bin/activate" pip3 install --upgrade pip pip3 install --upgrade wheel setuptools else source "${venv}/bin/activate" fi # Install Python dependencies echo 'Installing Python dependencies' pip3 install -r "${base_dir}/requirements.txt" # ----------------------------------------------------------------------------- echo "OK" ================================================ FILE: programs/asr/coqui-stt/script/wav2text ================================================ #!/usr/bin/env bash set -eo pipefail # Directory of *this* script this_dir="$( cd "$( dirname "$0" )" && pwd )" # Base directory of repo base_dir="$(realpath "${this_dir}/..")" # Path to virtual environment : "${venv:=${base_dir}/.venv}" if [ -d "${venv}" ]; then source "${venv}/bin/activate" fi python3 "${base_dir}/bin/coqui_stt_wav2text.py" "$@" ================================================ FILE: programs/asr/faster-whisper/README.md ================================================ # Faster Whisper Speech to text service for Rhasspy based on [faster-whisper](https://github.com/guillaumekln/faster-whisper/). Additional models can be downloaded here: https://github.com/rhasspy/models/releases/tag/v1.0 ## Installation 1. Copy the contents of this directory to `config/programs/asr/faster-whisper/` 2. Run `script/setup.py` 3. 
def main() -> None:
    """Serve faster-whisper transcriptions over a Unix domain socket.

    Loads the Whisper model once, then accepts connections one at a time.
    Each client streams audio-chunk events; on audio-stop (or end of stream)
    the buffered audio is transcribed and a transcript event is written back.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("model", help="Path to faster-whisper model directory")
    parser.add_argument(
        "--socketfile", required=True, help="Path to Unix domain socket file"
    )
    parser.add_argument(
        "--device",
        default="cpu",
        help="Device to use for inference (default: cpu)",
    )
    parser.add_argument(
        "--language",
        help="Language to set for transcription",
    )
    parser.add_argument(
        "--compute-type",
        default="default",
        help="Compute type (float16, int8, etc.)",
    )
    parser.add_argument(
        "--beam-size",
        type=int,
        default=1,
    )
    parser.add_argument("--debug", action="store_true", help="Log DEBUG messages")
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    # Remove any stale socket file left over from a previous run
    try:
        os.unlink(args.socketfile)
    except OSError:
        pass

    try:
        # Create socket server
        sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        sock.bind(args.socketfile)
        sock.listen()

        # Load converted faster-whisper model
        model = WhisperModel(
            args.model, device=args.device, compute_type=args.compute_type
        )

        _LOGGER.info("Ready")

        # Handle clients sequentially
        while True:
            try:
                conn, client_addr = sock.accept()
                _LOGGER.debug("Connection from %s", client_addr)

                awaiting_first_chunk = True
                with conn, conn.makefile(
                    mode="rwb"
                ) as conn_file, io.BytesIO() as wav_io:
                    wav_file: wave.Wave_write = wave.open(wav_io, "wb")
                    with wav_file:
                        while True:
                            event = read_event(conn_file)  # type: ignore
                            if event is None:
                                # Client closed the stream
                                break

                            if AudioChunk.is_type(event.type):
                                chunk = AudioChunk.from_event(event)
                                if awaiting_first_chunk:
                                    # WAV parameters come from the first chunk
                                    _LOGGER.debug("Receiving audio")
                                    wav_file.setframerate(chunk.rate)
                                    wav_file.setsampwidth(chunk.width)
                                    wav_file.setnchannels(chunk.channels)
                                    awaiting_first_chunk = False

                                wav_file.writeframes(chunk.audio)
                            elif AudioStop.is_type(event.type):
                                _LOGGER.debug("Audio stopped")
                                break

                    # Rewind the in-memory WAV and transcribe the utterance
                    wav_io.seek(0)
                    segments, _info = model.transcribe(
                        wav_io,
                        beam_size=args.beam_size,
                        language=args.language,
                    )
                    text = " ".join(segment.text for segment in segments)
                    _LOGGER.info(text)

                    write_event(Transcript(text=text).event(), conn_file)  # type: ignore
            except KeyboardInterrupt:
                break
            except Exception:
                _LOGGER.exception("Error communicating with socket client")
    finally:
        os.unlink(args.socketfile)
default="default", help="Compute type (float16, int8, etc.)", ) parser.add_argument( "--beam-size", type=int, default=1, ) parser.add_argument("--debug", action="store_true", help="Log DEBUG messages") args = parser.parse_args() logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) # Load converted faster-whisper model _LOGGER.debug("Loading model: %s", args.model) model = WhisperModel(args.model, device=args.device, compute_type=args.compute_type) _LOGGER.info("Model loaded") for wav_path in args.wav_file: _LOGGER.debug("Processing %s", wav_path) start_time = time.monotonic_ns() segments, _info = model.transcribe( wav_path, beam_size=args.beam_size, language=args.language, ) text = " ".join(segment.text for segment in segments) end_time = time.monotonic_ns() _LOGGER.debug( "Transcribed %s in %s second(s)", wav_path, (end_time - start_time) / 1e9 ) _LOGGER.debug(text) print(text, flush=True) # ----------------------------------------------------------------------------- if __name__ == "__main__": main() ================================================ FILE: programs/asr/faster-whisper/script/download.py ================================================ #!/usr/bin/env python3 import argparse import logging import tarfile from pathlib import Path from urllib.request import urlopen _DIR = Path(__file__).parent _LOGGER = logging.getLogger("setup") MODELS = [ "tiny", "tiny-int8", "base", "base-int8", "small", "small-int8", ] def main() -> None: parser = argparse.ArgumentParser() parser.add_argument( "model", nargs="+", choices=MODELS, help="faster-whisper model(s) to download", ) parser.add_argument( "--destination", help="Path to destination directory (default: share)" ) parser.add_argument( "--link-format", default="https://github.com/rhasspy/models/releases/download/v1.0/asr_faster-whisper-{model}.tar.gz", help="Format string for download URLs", ) args = parser.parse_args() logging.basicConfig(level=logging.INFO) if args.destination: 
args.destination = Path(args.destination) else: # Assume we're in programs/asr/faster-whisper/script data_dir = _DIR.parent.parent.parent.parent / "data" args.destination = data_dir / "asr" / "faster-whisper" args.destination.parent.mkdir(parents=True, exist_ok=True) for model in args.model: url = args.link_format.format(model=model) _LOGGER.info("Downloading %s", url) with urlopen(url) as response: with tarfile.open(mode="r|*", fileobj=response) as tar_gz: _LOGGER.info("Extracting to %s", args.destination) tar_gz.extractall(args.destination) if __name__ == "__main__": main() ================================================ FILE: programs/asr/faster-whisper/script/server ================================================ #!/usr/bin/env bash set -eo pipefail # Directory of *this* script this_dir="$( cd "$( dirname "$0" )" && pwd )" # Base directory of repo base_dir="$(realpath "${this_dir}/..")" # Path to virtual environment : "${venv:=${base_dir}/.venv}" if [ -d "${venv}" ]; then source "${venv}/bin/activate" fi socket_dir="${base_dir}/var/run" mkdir -p "${socket_dir}" python3 "${base_dir}/bin/faster_whisper_server.py" --socketfile "${socket_dir}/faster-whisper.socket" "$@" ================================================ FILE: programs/asr/faster-whisper/script/setup ================================================ #!/usr/bin/env bash set -eo pipefail # Directory of *this* script this_dir="$( cd "$( dirname "$0" )" && pwd )" # Base directory of repo base_dir="$(realpath "${this_dir}/..")" # Path to virtual environment : "${venv:=${base_dir}/.venv}" # Python binary to use : "${PYTHON=python3}" python_version="$(${PYTHON} --version)" if [ ! 
-d "${venv}" ]; then # Create virtual environment echo "Creating virtual environment at ${venv} (${python_version})" rm -rf "${venv}" "${PYTHON}" -m venv "${venv}" source "${venv}/bin/activate" pip3 install --upgrade pip pip3 install --upgrade wheel setuptools else source "${venv}/bin/activate" fi # Install Python dependencies echo 'Installing Python dependencies' pip3 install -e "${base_dir}/src" # Install rhasspy3 rhasspy3_dir="${base_dir}/../../../.." pip3 install -e "${rhasspy3_dir}" # ----------------------------------------------------------------------------- echo "OK" ================================================ FILE: programs/asr/faster-whisper/script/wav2text ================================================ #!/usr/bin/env bash set -eo pipefail # Directory of *this* script this_dir="$( cd "$( dirname "$0" )" && pwd )" # Base directory of repo base_dir="$(realpath "${this_dir}/..")" # Path to virtual environment : "${venv:=${base_dir}/.venv}" if [ -d "${venv}" ]; then source "${venv}/bin/activate" fi python3 "${base_dir}/bin/faster_whisper_wav2text.py" "$@" ================================================ FILE: programs/asr/faster-whisper/src/LICENSE ================================================ MIT License Copyright (c) 2023 Guillaume Klein Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: programs/asr/faster-whisper/src/README.md ================================================ # Faster Whisper transcription with CTranslate2 This repository demonstrates how to implement the Whisper transcription using [CTranslate2](https://github.com/OpenNMT/CTranslate2/), which is a fast inference engine for Transformer models. This implementation is about 4 times faster than [openai/whisper](https://github.com/openai/whisper) for the same accuracy while using less memory. The efficiency can be further improved with 8-bit quantization on both CPU and GPU. ## Installation ```bash pip install -e .[conversion] ``` The model conversion requires the modules `transformers` and `torch` which are installed by the `[conversion]` requirement. Once a model is converted, these modules are no longer needed and the installation could be simplified to: ```bash pip install -e . ``` ## Usage ### Model conversion A Whisper model should be first converted into the CTranslate2 format. For example the command below converts the "medium" Whisper model and saves the weights in FP16: ```bash ct2-transformers-converter --model openai/whisper-medium --output_dir whisper-medium-ct2 --quantization float16 ``` If needed, models can also be converted from the code. See the [conversion API](https://opennmt.net/CTranslate2/python/ctranslate2.converters.TransformersConverter.html). 
### Transcription ```python from faster_whisper import WhisperModel model_path = "whisper-medium-ct2/" # Run on GPU with FP16 model = WhisperModel(model_path, device="cuda", compute_type="float16") # or run on GPU with INT8 # model = WhisperModel(model_path, device="cuda", compute_type="int8_float16") # or run on CPU with INT8 # model = WhisperModel(model_path, device="cpu", compute_type="int8") segments, info = model.transcribe("audio.mp3", beam_size=5) print("Detected language '%s' with probability %f" % (info.language, info.language_probability)) for segment in segments: print("[%ds -> %ds] %s" % (segment.start, segment.end, segment.text)) ``` ## Comparing performance against openai/whisper If you are comparing the performance against [openai/whisper](https://github.com/openai/whisper), you should make sure to use the same settings in both frameworks. In particular: * In openai/whisper, `model.transcribe` uses a beam size of 1 by default. A different beam size will have an important impact on performance so make sure to use the same. * When running on CPU, make sure to set the same number of threads. Both frameworks will read the environment variable `OMP_NUM_THREADS`, which can be set when running your script: ```bash OMP_NUM_THREADS=4 python3 my_script.py ``` ================================================ FILE: programs/asr/faster-whisper/src/faster_whisper/__init__.py ================================================ from faster_whisper.transcribe import WhisperModel ================================================ FILE: programs/asr/faster-whisper/src/faster_whisper/audio.py ================================================ import av import numpy as np def decode_audio(input_file, sampling_rate=16000): """Decodes the audio. Args: input_file: Path to the input file or a file-like object. sampling_rate: Resample the audio to this sample rate. Returns: A float32 Numpy array. 
""" fifo = av.audio.fifo.AudioFifo() resampler = av.audio.resampler.AudioResampler( format="s16", layout="mono", rate=sampling_rate, ) with av.open(input_file) as container: # Decode and resample each audio frame. for frame in container.decode(audio=0): frame.pts = None for new_frame in resampler.resample(frame): fifo.write(new_frame) # Flush the resampler. for new_frame in resampler.resample(None): fifo.write(new_frame) frame = fifo.read() # Convert s16 back to f32. return frame.to_ndarray().flatten().astype(np.float32) / 32768.0 ================================================ FILE: programs/asr/faster-whisper/src/faster_whisper/feature_extractor.py ================================================ import numpy as np # Adapted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/feature_extraction_whisper.py class FeatureExtractor: def __init__( self, feature_size=80, sampling_rate=16000, hop_length=160, chunk_length=30, n_fft=400, ): self.n_fft = n_fft self.hop_length = hop_length self.chunk_length = chunk_length self.n_samples = chunk_length * sampling_rate self.nb_max_frames = self.n_samples // hop_length self.time_per_frame = hop_length / sampling_rate self.sampling_rate = sampling_rate self.mel_filters = self.get_mel_filters( sampling_rate, n_fft, n_mels=feature_size ) def get_mel_filters(self, sr, n_fft, n_mels=128, dtype=np.float32): # Initialize the weights n_mels = int(n_mels) weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype) # Center freqs of each FFT bin fftfreqs = np.fft.rfftfreq(n=n_fft, d=1.0 / sr) # 'Center freqs' of mel bands - uniformly spaced between limits min_mel = 0.0 max_mel = 45.245640471924965 mels = np.linspace(min_mel, max_mel, n_mels + 2) mels = np.asanyarray(mels) # Fill in the linear scale f_min = 0.0 f_sp = 200.0 / 3 freqs = f_min + f_sp * mels # And now the nonlinear scale min_log_hz = 1000.0 # beginning of log region (Hz) min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) 
logstep = np.log(6.4) / 27.0 # step size for log region # If we have vector data, vectorize log_t = mels >= min_log_mel freqs[log_t] = min_log_hz * np.exp(logstep * (mels[log_t] - min_log_mel)) mel_f = freqs fdiff = np.diff(mel_f) ramps = np.subtract.outer(mel_f, fftfreqs) for i in range(n_mels): # lower and upper slopes for all bins lower = -ramps[i] / fdiff[i] upper = ramps[i + 2] / fdiff[i + 1] # .. then intersect them with each other and zero weights[i] = np.maximum(0, np.minimum(lower, upper)) # Slaney-style mel is scaled to be approx constant energy per channel enorm = 2.0 / (mel_f[2 : n_mels + 2] - mel_f[:n_mels]) weights *= enorm[:, np.newaxis] return weights def fram_wave(self, waveform, center=True): """ Transform a raw waveform into a list of smaller waveforms. The window length defines how much of the signal is contain in each frame (smalle waveform), while the hope length defines the step between the beginning of each new frame. Centering is done by reflecting the waveform which is first centered around `frame_idx * hop_length`. """ frames = [] for i in range(0, waveform.shape[0] + 1, self.hop_length): half_window = (self.n_fft - 1) // 2 + 1 if center: start = i - half_window if i > half_window else 0 end = ( i + half_window if i < waveform.shape[0] - half_window else waveform.shape[0] ) frame = waveform[start:end] if start == 0: padd_width = (-i + half_window, 0) frame = np.pad(frame, pad_width=padd_width, mode="reflect") elif end == waveform.shape[0]: padd_width = (0, (i - waveform.shape[0] + half_window)) frame = np.pad(frame, pad_width=padd_width, mode="reflect") else: frame = waveform[i : i + self.n_fft] frame_width = frame.shape[0] if frame_width < waveform.shape[0]: frame = np.lib.pad( frame, pad_width=(0, self.n_fft - frame_width), mode="constant", constant_values=0, ) frames.append(frame) return np.stack(frames, 0) def stft(self, frames, window): """ Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. 
Should give the same results as `torch.stft`. """ frame_size = frames.shape[1] fft_size = self.n_fft if fft_size is None: fft_size = frame_size if fft_size < frame_size: raise ValueError("FFT size must greater or equal the frame size") # number of FFT bins to store num_fft_bins = (fft_size >> 1) + 1 data = np.empty((len(frames), num_fft_bins), dtype=np.complex64) fft_signal = np.zeros(fft_size) for f, frame in enumerate(frames): if window is not None: np.multiply(frame, window, out=fft_signal[:frame_size]) else: fft_signal[:frame_size] = frame data[f] = np.fft.fft(fft_signal, axis=0)[:num_fft_bins] return data.T def __call__(self, waveform): """ Compute the log-Mel spectrogram of the provided audio, gives similar results whisper's original torch implementation with 1e-5 tolerance. """ window = np.hanning(self.n_fft + 1)[:-1] frames = self.fram_wave(waveform) stft = self.stft(frames, window=window) magnitudes = np.abs(stft[:, :-1]) ** 2 filters = self.mel_filters mel_spec = filters @ magnitudes log_spec = np.log10(np.clip(mel_spec, a_min=1e-10, a_max=None)) log_spec = np.maximum(log_spec, log_spec.max() - 8.0) log_spec = (log_spec + 4.0) / 4.0 return log_spec ================================================ FILE: programs/asr/faster-whisper/src/faster_whisper/transcribe.py ================================================ import collections import os import zlib import ctranslate2 import numpy as np import tokenizers from faster_whisper.audio import decode_audio from faster_whisper.feature_extractor import FeatureExtractor class Segment(collections.namedtuple("Segment", ("start", "end", "text"))): pass class AudioInfo( collections.namedtuple("AudioInfo", ("language", "language_probability")) ): pass class TranscriptionOptions( collections.namedtuple( "TranscriptionOptions", ( "beam_size", "best_of", "patience", "log_prob_threshold", "no_speech_threshold", "compression_ratio_threshold", "condition_on_previous_text", "temperatures", ), ) ): pass class WhisperModel: def 
__init__( self, model_path, device="auto", compute_type="default", cpu_threads=0, ): """Initializes the Whisper model. Args: model_path: Path to the converted model. device: Device to use for computation ("cpu", "cuda", "auto"). compute_type: Type to use for computation. See https://opennmt.net/CTranslate2/quantization.html. cpu_threads: Number of threads to use when running on CPU (4 by default). A non zero value overrides the OMP_NUM_THREADS environment variable. """ self.model = ctranslate2.models.Whisper( model_path, device=device, compute_type=compute_type, intra_threads=cpu_threads, ) self.feature_extractor = FeatureExtractor() self.decoder = tokenizers.decoders.ByteLevel() with open(os.path.join(model_path, "vocabulary.txt")) as vocab_file: self.ids_to_tokens = [line.rstrip("\n") for line in vocab_file] self.tokens_to_ids = { token: i for i, token in enumerate(self.ids_to_tokens) } self.eot_id = self.tokens_to_ids["<|endoftext|>"] self.timestamp_begin_id = self.tokens_to_ids["<|notimestamps|>"] + 1 self.input_stride = 2 self.time_precision = 0.02 self.max_length = 448 def transcribe( self, input_file, language=None, beam_size=5, best_of=5, patience=1, temperature=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0], compression_ratio_threshold=2.4, log_prob_threshold=-1.0, no_speech_threshold=0.6, condition_on_previous_text=True, ): """Transcribes an input file. Arguments: input_file: Path to the input file or a file-like object. language: The language spoken in the audio. If not set, the language will be detected in the first 30 seconds of audio. beam_size: Beam size to use for decoding. best_of: Number of candidates when sampling with non-zero temperature. patience: Beam search patience factor. temperature: Temperature for sampling. It can be a tuple of temperatures, which will be successively used upon failures according to either `compression_ratio_threshold` or `logprob_threshold`. 
compression_ratio_threshold: If the gzip compression ratio is above this value, treat as failed. log_prob_threshold: If the average log probability over sampled tokens is below this value, treat as failed. no_speech_threshold: If the no_speech probability is higher than this value AND the average log probability over sampled tokens is below `logprob_threshold`, consider the segment as silent. condition_on_previous_text: If True, the previous output of the model is provided as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop, such as repetition looping or timestamps going out of sync. Returns: A tuple with: - a generator over transcribed segments - an instance of AudioInfo """ audio = decode_audio( input_file, sampling_rate=self.feature_extractor.sampling_rate ) features = self.feature_extractor(audio) if language is None: segment = self.get_segment(features) input = self.get_input(segment) results = self.model.detect_language(input) language_token, language_probability = results[0][0] language = language_token[2:-2] else: language_probability = 1 options = TranscriptionOptions( beam_size=beam_size, best_of=best_of, patience=patience, log_prob_threshold=log_prob_threshold, no_speech_threshold=no_speech_threshold, compression_ratio_threshold=compression_ratio_threshold, condition_on_previous_text=condition_on_previous_text, temperatures=( temperature if isinstance(temperature, (list, tuple)) else [temperature] ), ) segments = self.generate_segments(features, language, options) audio_info = AudioInfo( language=language, language_probability=language_probability, ) return segments, audio_info def generate_segments(self, features, language, options): tokenized_segments = self.generate_tokenized_segments( features, language, options ) for start, end, tokens in tokenized_segments: text = self.decode_text_tokens(tokens) if not text.strip(): continue yield Segment( 
start=start, end=end, text=text, ) def generate_tokenized_segments(self, features, language, options): num_frames = features.shape[-1] offset = 0 all_tokens = [] prompt_reset_since = 0 while offset < num_frames: time_offset = offset * self.feature_extractor.time_per_frame segment = self.get_segment(features, offset) segment_duration = segment.shape[-1] * self.feature_extractor.time_per_frame previous_tokens = all_tokens[prompt_reset_since:] prompt = self.get_prompt(language, previous_tokens) result, temperature = self.generate_with_fallback(segment, prompt, options) if ( result.no_speech_prob > options.no_speech_threshold and result.scores[0] < options.log_prob_threshold ): offset += segment.shape[-1] continue tokens = result.sequences_ids[0] consecutive_timestamps = [ i for i in range(len(tokens)) if i > 0 and tokens[i] >= self.timestamp_begin_id and tokens[i - 1] >= self.timestamp_begin_id ] if len(consecutive_timestamps) > 0: last_slice = 0 for i, current_slice in enumerate(consecutive_timestamps): sliced_tokens = tokens[last_slice:current_slice] start_timestamp_position = ( sliced_tokens[0] - self.timestamp_begin_id ) end_timestamp_position = sliced_tokens[-1] - self.timestamp_begin_id start_time = ( time_offset + start_timestamp_position * self.time_precision ) end_time = ( time_offset + end_timestamp_position * self.time_precision ) last_in_window = i + 1 == len(consecutive_timestamps) # Include the last timestamp so that all tokens are included in a segment. 
if last_in_window: sliced_tokens.append(tokens[current_slice]) yield start_time, end_time, sliced_tokens last_slice = current_slice last_timestamp_position = ( tokens[last_slice - 1] - self.timestamp_begin_id ) offset += last_timestamp_position * self.input_stride all_tokens.extend(tokens[: last_slice + 1]) else: duration = segment_duration timestamps = [ token for token in tokens if token >= self.timestamp_begin_id ] if len(timestamps) > 0 and timestamps[-1] != self.timestamp_begin_id: last_timestamp_position = timestamps[-1] - self.timestamp_begin_id duration = last_timestamp_position * self.time_precision yield time_offset, time_offset + duration, tokens offset += segment.shape[-1] all_tokens.extend(tokens) if not options.condition_on_previous_text or temperature > 0.5: prompt_reset_since = len(all_tokens) def decode_text_tokens(self, tokens): text_tokens = [ self.ids_to_tokens[token] for token in tokens if token < self.eot_id ] return self.decoder.decode(text_tokens) def generate_with_fallback(self, segment, prompt, options): features = self.get_input(segment) result = None final_temperature = None for temperature in options.temperatures: if temperature > 0: kwargs = { "beam_size": 1, "num_hypotheses": options.best_of, "sampling_topk": 0, "sampling_temperature": temperature, } else: kwargs = { "beam_size": options.beam_size, "patience": options.patience, } final_temperature = temperature result = self.model.generate( features, [prompt], max_length=self.max_length, return_scores=True, return_no_speech_prob=True, **kwargs, )[0] tokens = result.sequences_ids[0] text = self.decode_text_tokens(tokens) compression_ratio = get_compression_ratio(text) if ( compression_ratio <= options.compression_ratio_threshold and result.scores[0] >= options.log_prob_threshold ): break return result, final_temperature def get_prompt(self, language, previous_tokens): prompt = [] if previous_tokens: prompt.append(self.tokens_to_ids["<|startofprev|>"]) 
prompt.extend(previous_tokens[-(self.max_length // 2 - 1) :]) prompt += [ self.tokens_to_ids["<|startoftranscript|>"], self.tokens_to_ids["<|%s|>" % language], self.tokens_to_ids["<|transcribe|>"], ] return prompt def get_segment(self, features, offset=0): if offset > 0: features = features[:, offset:] num_frames = features.shape[-1] required_num_frames = self.feature_extractor.nb_max_frames if num_frames > required_num_frames: features = features[:, :required_num_frames] elif num_frames < required_num_frames: pad_widths = [(0, 0), (0, required_num_frames - num_frames)] features = np.pad(features, pad_widths) features = np.ascontiguousarray(features) return features def get_input(self, segment): segment = np.expand_dims(segment, 0) segment = ctranslate2.StorageView.from_array(segment) return segment def get_compression_ratio(text): text_bytes = text.encode("utf-8") return len(text_bytes) / len(zlib.compress(text_bytes)) ================================================ FILE: programs/asr/faster-whisper/src/requirements.conversion.txt ================================================ transformers[torch]>=4.23 ================================================ FILE: programs/asr/faster-whisper/src/requirements.txt ================================================ av==10.* ctranslate2>=3.5,<4 tokenizers==0.13.* ================================================ FILE: programs/asr/faster-whisper/src/setup.py ================================================ import os from setuptools import find_packages, setup def get_requirements(path): with open(path, encoding="utf-8") as requirements: return [requirement.strip() for requirement in requirements] base_dir = os.path.dirname(os.path.abspath(__file__)) install_requires = get_requirements(os.path.join(base_dir, "requirements.txt")) conversion_requires = get_requirements( os.path.join(base_dir, "requirements.conversion.txt") ) setup( name="faster-whisper", version="0.1.0", description="Faster Whisper transcription with 
CTranslate2", author="Guillaume Klein", python_requires=">=3.7", install_requires=install_requires, extras_require={ "conversion": conversion_requires, }, packages=find_packages(), ) ================================================ FILE: programs/asr/pocketsphinx/README.md ================================================ # Pocketsphinx Speech to text service for Rhasspy based on [Pocketsphinx](https://github.com/cmusphinx/pocketsphinx). Additional models can be downloaded here: https://github.com/synesthesiam/voice2json-profiles Model directories should have this layout: * model/ * acoustic_model/ * dictionary.txt * language_model.txt These correspond to the `-hmm`, `-dict`, and `-lm` decoder arguments. ## Installation 1. Copy the contents of this directory to `config/programs/asr/pocketsphinx/` 2. Run `script/setup` 3. Download a model with `script/download.py` * Example: `script/download.py en_cmu` * Models are downloaded to `config/data/asr/pocketsphinx` directory 4. Test with `script/wav2text` * Example `script/wav2text /path/to/en-us_pocketsphinx-cmu/ /path/to/test.wav` ================================================ FILE: programs/asr/pocketsphinx/bin/pocketsphinx_raw2text.py ================================================ #!/usr/bin/env python3 import argparse import logging import sys from pathlib import Path import pocketsphinx _LOGGER = logging.getLogger("pocketsphinx_raw2text") def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("model", help="Path to Pocketsphinx model directory") parser.add_argument( "--samples-per-chunk", type=int, default=1024, help="Number of samples to process at a time", ) parser.add_argument("--debug", action="store_true", help="Log DEBUG messages") args = parser.parse_args() logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) model_dir = Path(args.model) _LOGGER.debug("Loading model from %s", model_dir.absolute()) decoder_config = pocketsphinx.Decoder.default_config() 
decoder_config.set_string("-hmm", str(model_dir / "acoustic_model")) decoder_config.set_string("-dict", str(model_dir / "dictionary.txt")) decoder_config.set_string("-lm", str(model_dir / "language_model.txt")) decoder = pocketsphinx.Decoder(decoder_config) decoder.start_utt() chunk = sys.stdin.buffer.read(args.samples_per_chunk) _LOGGER.debug("Processing audio") while chunk: decoder.process_raw(chunk, False, False) chunk = sys.stdin.buffer.read(args.samples_per_chunk) decoder.end_utt() hyp = decoder.hyp() if hyp: text = hyp.hypstr else: text = "" _LOGGER.debug(text) print(text.strip()) # ----------------------------------------------------------------------------- if __name__ == "__main__": main() ================================================ FILE: programs/asr/pocketsphinx/bin/pocketsphinx_server.py ================================================ #!/usr/bin/env python3 import argparse import json import logging import os import socket import threading from pathlib import Path import pocketsphinx _LOGGER = logging.getLogger("pocketsphinx_server") def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("model", help="Path to Pocketsphinx model directory") parser.add_argument( "--socketfile", required=True, help="Path to Unix domain socket file" ) parser.add_argument( "-r", "--rate", type=int, default=16000, help="Input audio sample rate (default: 16000)", ) parser.add_argument("--debug", action="store_true", help="Log DEBUG messages") args = parser.parse_args() logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) # Need to unlink socket if it exists try: os.unlink(args.socketfile) except OSError: pass try: # Create socket server sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) sock.bind(args.socketfile) sock.listen() model_dir = Path(args.model) decoder_config = pocketsphinx.Decoder.default_config() decoder_config.set_string("-hmm", str(model_dir / "acoustic_model")) decoder_config.set_string("-dict", str(model_dir 
/ "dictionary.txt")) decoder_config.set_string("-lm", str(model_dir / "language_model.txt")) decoder = pocketsphinx.Decoder(decoder_config) _LOGGER.info("Ready") # Listen for connections while True: try: connection, client_address = sock.accept() _LOGGER.debug("Connection from %s", client_address) # Start new thread for client threading.Thread( target=handle_client, args=(connection, decoder, args.rate), daemon=True, ).start() except KeyboardInterrupt: break except Exception: _LOGGER.exception("Error communicating with socket client") finally: os.unlink(args.socketfile) def handle_client( connection: socket.socket, decoder: pocketsphinx.Decoder, rate: int ) -> None: try: decoder.start_utt() is_first_audio = True with connection, connection.makefile(mode="rwb") as conn_file: while True: event_info = json.loads(conn_file.readline()) event_type = event_info["type"] if event_type == "audio-chunk": if is_first_audio: _LOGGER.debug("Receiving audio") is_first_audio = False num_bytes = event_info["payload_length"] chunk = conn_file.read(num_bytes) decoder.process_raw(chunk, False, False) elif event_type == "audio-stop": _LOGGER.info("Audio stopped") decoder.end_utt() hyp = decoder.hyp() if hyp: text = hyp.hypstr.strip() else: text = "" transcript_str = ( json.dumps( {"type": "transcript", "data": {"text": text}}, ensure_ascii=False, ) + "\n" ) conn_file.write(transcript_str.encode()) break except Exception: _LOGGER.exception("Unexpected error in client thread") # ----------------------------------------------------------------------------- if __name__ == "__main__": main() ================================================ FILE: programs/asr/pocketsphinx/bin/pocketsphinx_wav2text.py ================================================ #!/usr/bin/env python3 import argparse import logging import wave from pathlib import Path import pocketsphinx _LOGGER = logging.getLogger("pocketsphinx_wav2text") def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("model", 
help="Path to Pocketsphinx model directory") parser.add_argument("wav_file", nargs="+", help="Path to WAV file(s) to transcribe") parser.add_argument("--debug", action="store_true", help="Log DEBUG messages") args = parser.parse_args() logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) model_dir = Path(args.model) _LOGGER.debug("Loading model from %s", model_dir.absolute()) decoder_config = pocketsphinx.Decoder.default_config() decoder_config.set_string("-hmm", str(model_dir / "acoustic_model")) decoder_config.set_string("-dict", str(model_dir / "dictionary.txt")) decoder_config.set_string("-lm", str(model_dir / "language_model.txt")) decoder = pocketsphinx.Decoder(decoder_config) for wav_path in args.wav_file: _LOGGER.debug("Processing %s", wav_path) wav_file: wave.Wave_read = wave.open(wav_path, "rb") with wav_file: assert wav_file.getframerate() == 16000, "16Khz sample rate required" assert wav_file.getsampwidth() == 2, "16-bit samples required" assert wav_file.getnchannels() == 1, "Mono audio required" audio_bytes = wav_file.readframes(wav_file.getnframes()) decoder.start_utt() decoder.process_raw(audio_bytes, False, True) decoder.end_utt() hyp = decoder.hyp() if hyp: text = hyp.hypstr else: text = "" print(text.strip()) # ----------------------------------------------------------------------------- if __name__ == "__main__": main() ================================================ FILE: programs/asr/pocketsphinx/requirements.txt ================================================ pocketsphinx @ https://github.com/synesthesiam/pocketsphinx-python/releases/download/v1.0/pocketsphinx-python.tar.gz ================================================ FILE: programs/asr/pocketsphinx/script/download.py ================================================ #!/usr/bin/env python3 import argparse import itertools import logging import tarfile from pathlib import Path from urllib.request import urlopen _DIR = Path(__file__).parent _LOGGER = 
logging.getLogger("setup") MODELS = {"en_cmu": "en-us_pocketsphinx-cmu"} def main() -> None: parser = argparse.ArgumentParser() parser.add_argument( "model", nargs="+", choices=list(itertools.chain(MODELS.keys(), MODELS.values())), help="Pocketsphinx model(s) to download", ) parser.add_argument("--destination", help="Path to destination directory") parser.add_argument( "--link-format", default="https://github.com/rhasspy/models/releases/download/v1.0/asr_pocketsphinx-{model}.tar.gz", help="Format string for download URLs", ) args = parser.parse_args() logging.basicConfig(level=logging.INFO) if args.destination: args.destination = Path(args.destination) else: # Assume we're in programs/asr/pocketsphinx/script data_dir = _DIR.parent.parent.parent.parent / "data" args.destination = data_dir / "asr" / "pocketsphinx" args.destination.parent.mkdir(parents=True, exist_ok=True) for model in args.model: model = MODELS.get(model, model) url = args.link_format.format(model=model) _LOGGER.info("Downloading %s", url) with urlopen(url) as response: with tarfile.open(mode="r|*", fileobj=response) as tar_gz: _LOGGER.info("Extracting to %s", args.destination) tar_gz.extractall(args.destination) if __name__ == "__main__": main() ================================================ FILE: programs/asr/pocketsphinx/script/raw2text ================================================ #!/usr/bin/env bash set -eo pipefail # Directory of *this* script this_dir="$( cd "$( dirname "$0" )" && pwd )" # Base directory of repo base_dir="$(realpath "${this_dir}/..")" # Path to virtual environment : "${venv:=${base_dir}/.venv}" if [ -d "${venv}" ]; then source "${venv}/bin/activate" fi python3 "${base_dir}/bin/pocketsphinx_raw2text.py" "$@" ================================================ FILE: programs/asr/pocketsphinx/script/server ================================================ #!/usr/bin/env bash set -eo pipefail # Directory of *this* script this_dir="$( cd "$( dirname "$0" )" && pwd )" # Base 
directory of repo base_dir="$(realpath "${this_dir}/..")" # Path to virtual environment : "${venv:=${base_dir}/.venv}" if [ -d "${venv}" ]; then source "${venv}/bin/activate" fi socket_dir="${base_dir}/var/run" mkdir -p "${socket_dir}" python3 "${base_dir}/bin/pocketsphinx_server.py" --socketfile "${socket_dir}/pocketsphinx.socket" "$@" ================================================ FILE: programs/asr/pocketsphinx/script/setup ================================================ #!/usr/bin/env bash set -eo pipefail # Directory of *this* script this_dir="$( cd "$( dirname "$0" )" && pwd )" # Base directory of repo base_dir="$(realpath "${this_dir}/..")" # Path to virtual environment : "${venv:=${base_dir}/.venv}" # Python binary to use : "${PYTHON=python3}" python_version="$(${PYTHON} --version)" if [ ! -d "${venv}" ]; then # Create virtual environment echo "Creating virtual environment at ${venv} (${python_version})" rm -rf "${venv}" "${PYTHON}" -m venv "${venv}" source "${venv}/bin/activate" pip3 install --upgrade pip pip3 install --upgrade wheel setuptools else source "${venv}/bin/activate" fi # Install Python dependencies echo 'Installing Python dependencies' pip3 install -r "${base_dir}/requirements.txt" # ----------------------------------------------------------------------------- echo "OK" ================================================ FILE: programs/asr/pocketsphinx/script/wav2text ================================================ #!/usr/bin/env bash set -eo pipefail # Directory of *this* script this_dir="$( cd "$( dirname "$0" )" && pwd )" # Base directory of repo base_dir="$(realpath "${this_dir}/..")" # Path to virtual environment : "${venv:=${base_dir}/.venv}" if [ -d "${venv}" ]; then source "${venv}/bin/activate" fi python3 "${base_dir}/bin/pocketsphinx_wav2text.py" "$@" ================================================ FILE: programs/asr/vosk/README.md ================================================ # Vosk Speech to text service for Rhasspy based on 
def main() -> None:
    """Read raw audio from stdin in chunks, feed it to Vosk, print the transcript."""
    parser = argparse.ArgumentParser()
    parser.add_argument("model", help="Path to Vosk model directory")
    parser.add_argument(
        "-r",
        "--rate",
        type=int,
        default=16000,
        help="Model sample rate (default: 16000)",
    )
    parser.add_argument(
        "--samples-per-chunk",
        type=int,
        default=1024,
        help="Number of samples to process at a time",
    )
    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)

    SetLogLevel(0)
    recognizer = KaldiRecognizer(Model(args.model), args.rate)

    # Stream stdin to the recognizer until EOF
    reader = sys.stdin.buffer
    chunk_size = args.samples_per_chunk
    chunk = reader.read(chunk_size)
    _LOGGER.debug("Processing audio")
    while chunk:
        recognizer.AcceptWaveform(chunk)
        chunk = reader.read(chunk_size)

    final = json.loads(recognizer.FinalResult())
    print(final["text"].strip())
logging.getLogger("vosk_server") def main(): parser = argparse.ArgumentParser() parser.add_argument("model", help="Path to Vosk model directory") parser.add_argument( "--socketfile", required=True, help="Path to Unix domain socket file" ) parser.add_argument( "-r", "--rate", type=int, default=16000, help="Input audio sample rate (default: 16000)", ) parser.add_argument("--debug", action="store_true", help="Log DEBUG messages") args = parser.parse_args() logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) # Need to unlink socket if it exists try: os.unlink(args.socketfile) except OSError: pass try: # Create socket server sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) sock.bind(args.socketfile) sock.listen() # Load Kaldi model SetLogLevel(0) model = Model(args.model) # Listen for connections while True: try: connection, client_address = sock.accept() _LOGGER.debug("Connection from %s", client_address) # Start new thread for client threading.Thread( target=handle_client, args=(connection, model, args.rate), daemon=True, ).start() except KeyboardInterrupt: break except Exception: _LOGGER.exception("Error communicating with socket client") finally: os.unlink(args.socketfile) def handle_client(connection: socket.socket, model: Model, rate: int) -> None: try: recognizer = KaldiRecognizer( model, rate, ) is_first_audio = True with connection, connection.makefile(mode="rwb") as conn_file: while True: event_info = json.loads(conn_file.readline()) event_type = event_info["type"] if event_type == "audio-chunk": if is_first_audio: _LOGGER.debug("Receiving audio") is_first_audio = False num_bytes = event_info["payload_length"] chunk = conn_file.read(num_bytes) recognizer.AcceptWaveform(chunk) elif event_type == "audio-stop": _LOGGER.info("Audio stopped") result = json.loads(recognizer.FinalResult()) _LOGGER.info(result) text = result["text"] transcript_str = ( json.dumps( {"type": "transcript", "data": {"text": text}}, ensure_ascii=False, ) + "\n" 
def main() -> None:
    """Transcribe WAV file(s) with a Vosk model, printing one line of text per file.

    Raises:
        ValueError: if a WAV file does not match the expected format
            (``--rate`` Hz, 16-bit, mono).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("model", help="Path to Vosk model directory")
    parser.add_argument("wav_file", nargs="+", help="Path to WAV file(s) to transcribe")
    parser.add_argument(
        "-r",
        "--rate",
        type=int,
        default=16000,
        help="Model sample rate (default: 16000)",
    )
    # Kept for CLI backward compatibility; the audio is currently fed to the
    # recognizer in a single AcceptWaveform call, so this value is unused.
    parser.add_argument(
        "--samples-per-chunk",
        type=int,
        default=1024,
        help="Number of samples to process at a time",
    )
    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)

    SetLogLevel(0)
    model = Model(args.model)
    recognizer = KaldiRecognizer(
        model,
        args.rate,
    )

    for wav_path in args.wav_file:
        _LOGGER.debug("Processing %s", wav_path)
        wav_file: wave.Wave_read = wave.open(wav_path, "rb")
        with wav_file:
            # Fix: validate against the configured --rate instead of a
            # hard-coded 16000, and raise instead of assert so the checks
            # survive `python -O`.
            if wav_file.getframerate() != args.rate:
                raise ValueError(f"{args.rate} Hz sample rate required")
            if wav_file.getsampwidth() != 2:
                raise ValueError("16-bit samples required")
            if wav_file.getnchannels() != 1:
                raise ValueError("Mono audio required")

            audio_bytes = wav_file.readframes(wav_file.getnframes())
            recognizer.AcceptWaveform(audio_bytes)

            # NOTE(review): FinalResult() is called once per file on the same
            # recognizer without an explicit reset — presumably Vosk resets
            # internally after FinalResult; verify for multi-file runs.
            result = json.loads(recognizer.FinalResult())
            print(result["text"].strip())
def main() -> None:
    """Download and unpack one or more Vosk models into the data directory."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "model",
        nargs="+",
        choices=list(itertools.chain(MODELS.keys(), MODELS.values())),
        help="Vosk model(s) to download",
    )
    parser.add_argument(
        "--destination", help="Path to destination directory (default: share)"
    )
    parser.add_argument(
        "--link-format",
        default="https://github.com/rhasspy/models/releases/download/v1.0/asr_vosk-model-{model}.tar.gz",
        help="Format string for download URLs",
    )
    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)

    if args.destination:
        dest = Path(args.destination)
    else:
        # Assume we're in programs/asr/vosk/script
        dest = _DIR.parent.parent.parent.parent / "data" / "asr" / "vosk"
    args.destination = dest
    args.destination.parent.mkdir(parents=True, exist_ok=True)

    for alias in args.model:
        # Map a short alias (e.g. "en_small") to the full model name
        full_name = MODELS.get(alias, alias)
        url = args.link_format.format(model=full_name)
        _LOGGER.info("Downloading %s", url)
        with urlopen(url) as response:
            with tarfile.open(mode="r|*", fileobj=response) as archive:
                # NOTE(review): remote archive; member paths are trusted
                # by extractall.
                _LOGGER.info("Extracting to %s", args.destination)
                archive.extractall(args.destination)
"${venv}/bin/activate" fi python3 "${base_dir}/bin/vosk_raw2text.py" "$@" ================================================ FILE: programs/asr/vosk/script/server ================================================ #!/usr/bin/env bash set -eo pipefail # Directory of *this* script this_dir="$( cd "$( dirname "$0" )" && pwd )" # Base directory of repo base_dir="$(realpath "${this_dir}/..")" # Path to virtual environment : "${venv:=${base_dir}/.venv}" if [ -d "${venv}" ]; then source "${venv}/bin/activate" fi socket_dir="${base_dir}/var/run" mkdir -p "${socket_dir}" python3 "${base_dir}/bin/vosk_server.py" --socketfile "${socket_dir}/vosk.socket" "$@" ================================================ FILE: programs/asr/vosk/script/setup ================================================ #!/usr/bin/env bash set -eo pipefail # Directory of *this* script this_dir="$( cd "$( dirname "$0" )" && pwd )" # Base directory of repo base_dir="$(realpath "${this_dir}/..")" # Path to virtual environment : "${venv:=${base_dir}/.venv}" # Python binary to use : "${PYTHON=python3}" python_version="$(${PYTHON} --version)" if [ ! 
-d "${venv}" ]; then # Create virtual environment echo "Creating virtual environment at ${venv} (${python_version})" rm -rf "${venv}" "${PYTHON}" -m venv "${venv}" source "${venv}/bin/activate" pip3 install --upgrade pip pip3 install --upgrade wheel setuptools else source "${venv}/bin/activate" fi # Install Python dependencies echo 'Installing Python dependencies' pip3 install -r "${base_dir}/requirements.txt" # ----------------------------------------------------------------------------- echo "OK" ================================================ FILE: programs/asr/vosk/script/wav2text ================================================ #!/usr/bin/env bash set -eo pipefail # Directory of *this* script this_dir="$( cd "$( dirname "$0" )" && pwd )" # Base directory of repo base_dir="$(realpath "${this_dir}/..")" # Path to virtual environment : "${venv:=${base_dir}/.venv}" if [ -d "${venv}" ]; then source "${venv}/bin/activate" fi python3 "${base_dir}/bin/vosk_wav2text.py" "$@" ================================================ FILE: programs/asr/whisper/README.md ================================================ # Whisper Speech to text service for Rhasspy based on [Whisper](https://github.com/openai/whisper). Models are downloaded automatically the first time they're used to the `config/data/asr/whisper` directory. Available models: * tiny.en * tiny * base.en * base * small.en * small * medium.en * medium * large-v1 * large-v2 * large ## Installation 1. Copy the contents of this directory to `config/programs/asr/whisper/` 2. Run `script/setup` 3. 
def main():
    """Serve Whisper transcriptions over a Unix domain socket, one thread per client."""
    parser = argparse.ArgumentParser()
    parser.add_argument("model", help="Name of Whisper model to use")
    parser.add_argument(
        "--language",
        help="Whisper language",
    )
    parser.add_argument("--device", default="cpu", choices=("cpu", "cuda"))
    parser.add_argument(
        "--socketfile", required=True, help="Path to Unix domain socket file"
    )
    parser.add_argument("--debug", action="store_true", help="Log DEBUG messages")
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    # Clear out a stale socket file left by a previous run
    try:
        os.unlink(args.socketfile)
    except OSError:
        pass

    try:
        server = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        server.bind(args.socketfile)
        server.listen()

        # Load the model once; all client threads share it
        _LOGGER.debug("Loading model: %s", args.model)
        model = load_model(args.model, device=args.device)
        _LOGGER.info("Ready")

        # Accept loop: one daemon thread per client connection
        while True:
            try:
                client_conn, client_addr = server.accept()
                _LOGGER.debug("Connection from %s", client_addr)
                threading.Thread(
                    target=handle_client,
                    args=(client_conn, model, args),
                    daemon=True,
                ).start()
            except KeyboardInterrupt:
                break
            except Exception:
                _LOGGER.exception("Error communicating with socket client")
    finally:
        os.unlink(args.socketfile)
event_info["type"] if event_type == "audio-chunk": if is_first_audio: _LOGGER.debug("Receiving audio") is_first_audio = False num_bytes = event_info["payload_length"] audio_bytes += conn_file.read(num_bytes) elif event_type == "audio-stop": _LOGGER.debug("Audio stopped") break audio_array = np.frombuffer(audio_bytes, dtype=np.int16) audio_array = audio_array.astype(np.float32) / 32768.0 result = transcribe(model, audio_array, language=args.language) _LOGGER.debug(result) text = result["text"] transcript_str = ( json.dumps( {"type": "transcript", "data": {"text": text}}, ensure_ascii=False ) + "\n" ) conn_file.write(transcript_str.encode()) except Exception: _LOGGER.exception("Unexpected error in client thread") # ----------------------------------------------------------------------------- if __name__ == "__main__": main() ================================================ FILE: programs/asr/whisper/bin/whisper_wav2text.py ================================================ #!/usr/bin/env python3 import argparse import logging from whisper import load_model, transcribe _LOGGER = logging.getLogger("whisper_wav2text") def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("model", help="Name of Whisper model to use") parser.add_argument("wav_file", nargs="+", help="Path to WAV file(s) to transcribe") parser.add_argument( "--language", help="Whisper language", ) parser.add_argument("--device", default="cpu", choices=("cpu", "cuda")) # parser.add_argument("--debug", action="store_true", help="Log DEBUG messages") args = parser.parse_args() logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) _LOGGER.debug("Loading model: %s", args.model) model = load_model(args.model, device=args.device) for wav_file in args.wav_file: _LOGGER.debug("Processing %s", wav_file) result = transcribe(model, wav_file, language=args.language) _LOGGER.debug(result) text = result["text"] print(text.strip()) # 
----------------------------------------------------------------------------- if __name__ == "__main__": main() ================================================ FILE: programs/asr/whisper/requirements.txt ================================================ git+https://github.com/openai/whisper.git ================================================ FILE: programs/asr/whisper/script/server ================================================ #!/usr/bin/env bash set -eo pipefail # Directory of *this* script this_dir="$( cd "$( dirname "$0" )" && pwd )" # Base directory of repo base_dir="$(realpath "${this_dir}/..")" # Path to virtual environment : "${venv:=${base_dir}/.venv}" if [ -d "${venv}" ]; then source "${venv}/bin/activate" fi socket_dir="${base_dir}/var/run" mkdir -p "${socket_dir}" python3 "${base_dir}/bin/whisper_server.py" --socketfile "${socket_dir}/whisper.socket" "$@" ================================================ FILE: programs/asr/whisper/script/setup ================================================ #!/usr/bin/env bash set -eo pipefail # Directory of *this* script this_dir="$( cd "$( dirname "$0" )" && pwd )" # Base directory of repo base_dir="$(realpath "${this_dir}/..")" # Path to virtual environment : "${venv:=${base_dir}/.venv}" # Python binary to use : "${PYTHON=python3}" python_version="$(${PYTHON} --version)" if [ ! 
-d "${venv}" ]; then # Create virtual environment echo "Creating virtual environment at ${venv} (${python_version})" rm -rf "${venv}" "${PYTHON}" -m venv "${venv}" source "${venv}/bin/activate" pip3 install --upgrade pip pip3 install --upgrade wheel setuptools else source "${venv}/bin/activate" fi # Install Python dependencies echo 'Installing Python dependencies' pip3 install -r "${base_dir}/requirements.txt" # ----------------------------------------------------------------------------- echo "OK" ================================================ FILE: programs/asr/whisper/script/wav2text ================================================ #!/usr/bin/env bash set -eo pipefail # Directory of *this* script this_dir="$( cd "$( dirname "$0" )" && pwd )" # Base directory of repo base_dir="$(realpath "${this_dir}/..")" # Path to virtual environment : "${venv:=${base_dir}/.venv}" if [ -d "${venv}" ]; then source "${venv}/bin/activate" fi python3 "${base_dir}/bin/whisper_wav2text.py" "$@" ================================================ FILE: programs/asr/whisper-cpp/.gitignore ================================================ /build/ ================================================ FILE: programs/asr/whisper-cpp/Dockerfile.libwhisper ================================================ FROM debian:bullseye as build ARG TARGETARCH ARG TARGETVARIANT ENV LANG C.UTF-8 ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update && \ apt-get install --yes build-essential wget WORKDIR /build ARG VERSION=1.1.0 RUN wget "https://github.com/ggerganov/whisper.cpp/archive/refs/tags/v${VERSION}.tar.gz" && \ tar -xzf "v${VERSION}.tar.gz" RUN mv "whisper.cpp-${VERSION}/" 'whisper.cpp' COPY lib/Makefile ./ RUN cd "whisper.cpp" && make -j8 RUN make # ----------------------------------------------------------------------------- FROM scratch COPY --from=build /build/libwhisper.so . 
================================================ FILE: programs/asr/whisper-cpp/Dockerfile.libwhisper.dockerignore ================================================ * !lib/Makefile ================================================ FILE: programs/asr/whisper-cpp/README.md ================================================ # Whisper.cpp Speech to text service for Rhasspy based on [whisper.cpp](https://github.com/ggerganov/whisper.cpp/). Additional models can be downloaded here: https://huggingface.co/datasets/ggerganov/whisper.cpp ## Installation 1. Copy the contents of this directory to `config/programs/asr/whisper-cpp/` 2. Run `script/setup.py` 3. Download a model with `script/download.py` * Example: `script/download.py en_tiny` * Models are downloaded to `config/data/asr/whisper-cpp` directory 4. Test with `script/wav2text` * Example `script/wav2text /path/to/ggml-tiny.en.bin /path/to/test.wav` ================================================ FILE: programs/asr/whisper-cpp/bin/whisper_cpp_server.py ================================================ #!/usr/bin/env python3 import argparse import json import logging import os import socket import threading import numpy as np from whisper_cpp import Whisper _LOGGER = logging.getLogger("whisper_cpp_server") def main(): parser = argparse.ArgumentParser() parser.add_argument("model", help="Path to whisper.cpp model file") parser.add_argument( "--socketfile", required=True, help="Path to Unix domain socket file" ) parser.add_argument("--debug", action="store_true", help="Log DEBUG messages") args = parser.parse_args() logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) # Need to unlink socket if it exists try: os.unlink(args.socketfile) except OSError: pass try: # Create socket server sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) sock.bind(args.socketfile) sock.listen() _LOGGER.debug("Loading model: %s", args.model) with Whisper(args.model) as whisper: _LOGGER.info("Ready") # Listen for 
def handle_client(connection: socket.socket, whisper: Whisper) -> None:
    """Read audio events from one client, then reply with a transcript event.

    Events are newline-delimited JSON: "audio-chunk" (followed by
    ``payload_length`` raw bytes) accumulates audio; "audio-stop" triggers
    transcription and a single "transcript" reply.
    """
    try:
        received_any = False
        with connection, connection.makefile(mode="rwb") as stream:
            chunks = []
            while True:
                event = json.loads(stream.readline())
                kind = event["type"]
                if kind == "audio-chunk":
                    if not received_any:
                        _LOGGER.debug("Receiving audio")
                        received_any = True
                    chunks.append(stream.read(event["payload_length"]))
                elif kind == "audio-stop":
                    _LOGGER.debug("Audio stopped")
                    break

            # int16 PCM -> float32 samples for whisper.cpp
            samples = np.frombuffer(b"".join(chunks), dtype=np.int16)
            samples = samples.astype(np.float32) / 32768.0

            text = " ".join(whisper.transcribe(samples))
            _LOGGER.debug(text)
            reply = json.dumps(
                {"type": "transcript", "data": {"text": text}}, ensure_ascii=False
            )
            stream.write((reply + "\n").encode())
    except Exception:
        _LOGGER.exception("Unexpected error in client thread")
nargs="+", help="Path to WAV file(s) to transcribe") parser.add_argument( "--debug", action="store_true", help="Print DEBUG messages to console" ) args = parser.parse_args() logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) _LOGGER.debug("Loading model: %s", args.model) with Whisper(args.model) as whisper: for wav_path in args.wav_file: wav_file: wave.Wave_read = wave.open(wav_path, "rb") with wav_file: rate = wav_file.getframerate() width = wav_file.getsampwidth() channels = wav_file.getnchannels() audio_bytes = wav_file.readframes(wav_file.getnframes()) if width != 2: audio_bytes = audioop.lin2lin(audio_bytes, width, 2) if channels != 1: audio_bytes = audioop.tomono(audio_bytes, 2, 1.0, 1.0) if rate != 16000: audio_bytes, _state = audioop.ratecv( audio_bytes, 2, 1, rate, 16000, None ) audio_array = np.frombuffer(audio_bytes, dtype=np.int16) audio_array = audio_array.astype(np.float32) / 32768.0 text = " ".join(whisper.transcribe(audio_array)) print(text) # ----------------------------------------------------------------------------- if __name__ == "__main__": main() ================================================ FILE: programs/asr/whisper-cpp/lib/Makefile ================================================ UNAME_M := $(shell uname -m) CFLAGS = -Iwhisper.cpp -O3 -std=c11 -fPIC CXXFLAGS = -Iwhisper.cpp -O3 -std=c++11 -fPIC --shared -static-libstdc++ LDFLAGS = # Linux CFLAGS += -pthread CXXFLAGS += -pthread AVX1_M := $(shell grep "avx " /proc/cpuinfo) ifneq (,$(findstring avx,$(AVX1_M))) CFLAGS += -mavx endif AVX2_M := $(shell grep "avx2 " /proc/cpuinfo) ifneq (,$(findstring avx2,$(AVX2_M))) CFLAGS += -mavx2 endif FMA_M := $(shell grep "fma " /proc/cpuinfo) ifneq (,$(findstring fma,$(FMA_M))) CFLAGS += -mfma endif F16C_M := $(shell grep "f16c " /proc/cpuinfo) ifneq (,$(findstring f16c,$(F16C_M))) CFLAGS += -mf16c endif SSE3_M := $(shell grep "sse3 " /proc/cpuinfo) ifneq (,$(findstring sse3,$(SSE3_M))) CFLAGS += -msse3 endif # amd64 and arm64 
# Must match struct in whisper.h
class WhisperFullParams(ctypes.Structure):
    """ctypes mirror of whisper.cpp's ``whisper_full_params`` struct.

    Field order and widths must match whisper.h exactly — the build pins
    whisper.cpp v1.1.0 (see Dockerfile.libwhisper); a mismatch silently
    corrupts every parameter read/write.
    """

    _fields_ = [
        ("strategy", ctypes.c_int),
        #
        ("n_max_text_ctx", ctypes.c_int),
        ("n_threads", ctypes.c_int),
        ("offset_ms", ctypes.c_int),
        ("duration_ms", ctypes.c_int),
        #
        ("translate", ctypes.c_bool),
        ("no_context", ctypes.c_bool),
        ("single_segment", ctypes.c_bool),
        ("print_special", ctypes.c_bool),
        ("print_progress", ctypes.c_bool),
        ("print_realtime", ctypes.c_bool),
        ("print_timestamps", ctypes.c_bool),
        #
        ("token_timestamps", ctypes.c_bool),
        ("thold_pt", ctypes.c_float),
        ("thold_ptsum", ctypes.c_float),
        ("max_len", ctypes.c_int),
        ("max_tokens", ctypes.c_int),
        #
        ("speed_up", ctypes.c_bool),
        ("audio_ctx", ctypes.c_int),
        #
        ("prompt_tokens", ctypes.c_void_p),
        ("prompt_n_tokens", ctypes.c_int),
        #
        ("language", ctypes.c_char_p),
        #
        ("suppress_blank", ctypes.c_bool),
        #
        ("temperature_inc", ctypes.c_float),
        ("entropy_thold", ctypes.c_float),
        ("logprob_thold", ctypes.c_float),
        ("no_speech_thold", ctypes.c_float),
        #
        ("greedy", ctypes.c_int * 1),
        ("beam_search", ctypes.c_int * 3),
        #
        ("new_segment_callback", ctypes.c_void_p),
        ("new_segment_callback_user_data", ctypes.c_void_p),
        #
        ("encoder_begin_callback", ctypes.c_void_p),
        ("encoder_begin_callback_user_data", ctypes.c_void_p),
    ]


class WhisperError(Exception):
    """Raised when a whisper.cpp C call returns a non-zero status."""

    pass


class Whisper:
    """Thin ctypes wrapper around libwhisper.so (whisper.cpp).

    Use as a context manager so the native context is freed on exit.
    """

    def __init__(
        self,
        model_path: Union[str, Path],
        libwhisper_path: Union[str, Path] = "libwhisper.so",
        language: str = "en",
    ):
        self.model_path = Path(model_path)
        self.whisper = ctypes.CDLL(str(libwhisper_path))

        # Set return types (defaults to c_int otherwise, which would
        # truncate pointers/strings)
        self.whisper.whisper_init_from_file.restype = ctypes.c_void_p
        self.whisper.whisper_full_default_params.restype = WhisperFullParams
        self.whisper.whisper_full_get_segment_text.restype = ctypes.c_char_p

        # initialize whisper.cpp context
        filename_bytes = str(self.model_path.absolute()).encode("utf-8")
        self.ctx = self.whisper.whisper_init_from_file(filename_bytes)

        # get default whisper parameters and adjust as needed
        self.params = self.whisper.whisper_full_default_params()
        self.params.print_realtime = False
        self.params.print_special = False
        self.params.print_progress = False
        self.params.print_timestamps = False
        self.params.language = language.encode("utf-8")

    def transcribe(self, audio_array: np.ndarray) -> Iterable[str]:
        """Transcribe float32 audio samples to text, yielding one string per segment.

        Callers in this repo normalize int16 PCM by dividing by 32768, so
        samples are expected in [-1.0, 1.0).
        """
        result = self.whisper.whisper_full(
            ctypes.c_void_p(self.ctx),
            self.params,
            audio_array.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
            len(audio_array),
        )
        if result != 0:
            raise WhisperError(str(result))

        num_segments = self.whisper.whisper_full_n_segments(ctypes.c_void_p(self.ctx))
        for i in range(num_segments):
            text_bytes = self.whisper.whisper_full_get_segment_text(
                ctypes.c_void_p(self.ctx), i
            )
            yield text_bytes.decode("utf-8")

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Free the native context; the wrapper is unusable afterwards
        self.whisper.whisper_free(ctypes.c_void_p(self.ctx))
        self.whisper = None
        self.ctx = None
"${base_dir}/Dockerfile.libwhisper" \ --platform 'linux/amd64,linux/arm64' \ --output "type=local,dest=${base_dir}/build/" # ----------------------------------------------------------------------------- echo "Copy the appropriate libwhisper.so from build/ to lib/" ================================================ FILE: programs/asr/whisper-cpp/script/download.py ================================================ #!/usr/bin/env python3 import argparse import logging import tarfile from pathlib import Path from urllib.request import urlopen _DIR = Path(__file__).parent _LOGGER = logging.getLogger("setup") MODELS = ["tiny.en", "base.en"] def main() -> None: parser = argparse.ArgumentParser() parser.add_argument( "model", nargs="+", choices=MODELS, help="Pocketsphinx model(s) to download", ) parser.add_argument( "--destination", help="Path to destination directory (default: share)" ) parser.add_argument( "--link-format", default="https://github.com/rhasspy/models/releases/download/v1.0/asr_whisper-cpp-ggml-{model}.tar.gz", help="Format string for download URLs", ) args = parser.parse_args() logging.basicConfig(level=logging.INFO) if args.destination: args.destination = Path(args.destination) else: # Assume we're in programs/asr/whisper-cpp/script data_dir = _DIR.parent.parent.parent.parent / "data" args.destination = data_dir / "asr" / "whisper-cpp" args.destination.parent.mkdir(parents=True, exist_ok=True) for model in args.model: url = args.link_format.format(model=model) _LOGGER.info("Downloading %s", url) with urlopen(url) as response: with tarfile.open(mode="r|*", fileobj=response) as tar_gz: _LOGGER.info("Extracting to %s", args.destination) tar_gz.extractall(args.destination) if __name__ == "__main__": main() ================================================ FILE: programs/asr/whisper-cpp/script/server ================================================ #!/usr/bin/env bash set -eo pipefail # Directory of *this* script this_dir="$( cd "$( dirname "$0" )" && pwd )" # 
#!/usr/bin/env python3
"""Download a prebuilt libwhisper.so shared library for this platform."""
import argparse
import logging
import platform
import tarfile
from pathlib import Path
from urllib.request import urlopen

_DIR = Path(__file__).parent
_LOGGER = logging.getLogger("setup")

# Map platform.machine() names to release-artifact platform names
PLATFORMS = {"x86_64": "amd64", "aarch64": "arm64"}


def main() -> None:
    """Resolve the target platform, then download/extract libwhisper."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--platform",
        help="CPU architecture to download (amd64, arm64)",
    )
    parser.add_argument(
        "--destination", help="Path to destination directory (default: lib)"
    )
    parser.add_argument(
        "--link-format",
        default="https://github.com/rhasspy/models/releases/download/v1.0/libwhisper_{platform}.tar.gz",
        help="Format string for download URLs",
    )
    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)

    if not args.platform:
        args.platform = platform.machine()

    # Unknown machine names pass through unchanged
    args.platform = PLATFORMS.get(args.platform, args.platform)

    if args.destination:
        # Bug fix: a user-supplied --destination arrives as str, which has no
        # .parent attribute and crashed below; convert to Path first.
        args.destination = Path(args.destination)
    else:
        args.destination = _DIR.parent / "lib"

    args.destination.parent.mkdir(parents=True, exist_ok=True)

    url = args.link_format.format(platform=args.platform)
    _LOGGER.info("Downloading %s", url)
    # Stream-extract the tarball without writing it to disk first
    with urlopen(url) as response:
        with tarfile.open(mode="r|*", fileobj=response) as tar_gz:
            _LOGGER.info("Extracting to %s", args.destination)
            tar_gz.extractall(args.destination)


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
"""Send stdin text to a Home Assistant conversation endpoint; print the reply."""
import argparse
import json
import logging
import sys
from pathlib import Path
from urllib.request import Request, urlopen

_FILE = Path(__file__)
_DIR = _FILE.parent
_LOGGER = logging.getLogger(_FILE.stem)


def main():
    """Read text from stdin, POST it as JSON, print the plain speech reply."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "url",
        help="URL of API endpoint",
    )
    parser.add_argument("token_file", help="Path to file with authorization token")
    parser.add_argument("--language", help="Language code to use")
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to console"
    )
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    # Long-lived access token read from a file, sent as a bearer header
    token = Path(args.token_file).read_text(encoding="utf-8").strip()
    headers = {"Authorization": f"Bearer {token}"}

    payload = {"text": sys.stdin.read()}
    if args.language:
        payload["language"] = args.language

    body = json.dumps(payload, ensure_ascii=False).encode("utf-8")
    request = Request(args.url, data=body, headers=headers)

    with urlopen(request) as http_response:
        reply = json.loads(http_response.read())

    # Dig out reply["response"]["speech"]["plain"]["speech"], defaulting to ""
    speech = (
        reply.get("response", {})
        .get("speech", {})
        .get("plain", {})
        .get("speech", "")
    )
    print(speech)


if __name__ == "__main__":
    main()
regex", ) args = parser.parse_args() # intent name -> [pattern] patterns: Dict[str, List[re.Pattern]] = defaultdict(list) for intent_name, pattern_str in args.intent: patterns[intent_name].append(re.compile(pattern_str, re.IGNORECASE)) try: while True: event = read_event() if event is None: break if Recognize.is_type(event.type): recognize = Recognize.from_event(event) text = _clean(recognize.text) intent = _recognize(text, patterns) if intent is None: write_event(NotRecognized().event()) else: write_event(intent.event()) except KeyboardInterrupt: pass def _clean(text: str) -> str: text = " ".join(text.split()) return text def _recognize(text: str, patterns: Dict[str, List[re.Pattern]]) -> Optional[Intent]: for intent_name, intent_patterns in patterns.items(): for intent_pattern in intent_patterns: match = intent_pattern.match(text) if match is None: continue return Intent( name=intent_name, entities=[ Entity(name=name, value=value) for name, value in match.groupdict().items() ], ) return None if __name__ == "__main__": main() ================================================ FILE: programs/mic/pyaudio/README.md ================================================ # PyAudio Audio input service for Rhasspy based on [PyAudio](https://people.csail.mit.edu/hubert/pyaudio/docs/). 
def main() -> None:
    """Stream microphone audio from PyAudio as AudioChunk events on stdout."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--rate", type=int, default=DEFAULT_IN_RATE, help="Sample rate (hertz)"
    )
    parser.add_argument(
        "--width", type=int, default=DEFAULT_IN_WIDTH, help="Sample width (bytes)"
    )
    parser.add_argument(
        "--channels", type=int, default=DEFAULT_IN_CHANNELS, help="Sample channel count"
    )
    parser.add_argument(
        "--samples-per-chunk",
        type=int,
        default=DEFAULT_SAMPLES_PER_CHUNK,
        help="Number of samples to process at a time",
    )
    parser.add_argument("--device", help="Name or index of device to use")
    #
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to console"
    )
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    rate, width, channels = args.rate, args.width, args.channels
    chunk_stream = iter_chunks(
        args.device, rate, width, channels, args.samples_per_chunk
    )
    for chunk in chunk_stream:
        # Each raw buffer becomes one AudioChunk event for the pipeline
        write_event(AudioChunk(rate, width, channels, chunk).event())
def iter_chunks(
    device: Optional[Union[int, str]],
    rate: int,
    width: int,
    channels: int,
    samples_per_chunk: int,
) -> Iterable[bytes]:
    """Open a PyAudio input stream and yield raw audio chunks.

    ``device`` may be None (system default), a numeric index (as int or str),
    or a device name to look up.

    Bug fix: the original asserted ``device is not None`` after the name
    search, but an unmatched name left ``device`` as the original *string*,
    so the assertion could never fire and PyAudio failed confusingly later.
    Resource fix: the stream is now closed before terminate().
    """
    audio_system = pyaudio.PyAudio()
    stream = None
    try:
        if isinstance(device, str):
            try:
                device = int(device)
            except ValueError:
                # Not an index: resolve the device name to an index
                resolved: Optional[int] = None
                for i in range(audio_system.get_device_count()):
                    info = audio_system.get_device_info_by_index(i)
                    if device == info["name"]:
                        resolved = i
                        break

                assert resolved is not None, f"No device named: {device}"
                device = resolved

        _LOGGER.debug("Device: %s", device)
        stream = audio_system.open(
            input_device_index=device,
            format=audio_system.get_format_from_width(width),
            channels=channels,
            rate=rate,
            input=True,
            frames_per_buffer=samples_per_chunk,
        )

        chunk = stream.read(samples_per_chunk)
        while chunk:
            yield chunk
            chunk = stream.read(samples_per_chunk)
    except KeyboardInterrupt:
        pass
    finally:
        if stream is not None:
            stream.close()
        audio_system.terminate()
================================================ #!/usr/bin/env bash set -eo pipefail # Directory of *this* script this_dir="$( cd "$( dirname "$0" )" && pwd )" # Base directory of repo base_dir="$(realpath "${this_dir}/..")" # Path to virtual environment : "${venv:=${base_dir}/.venv}" # Python binary to use : "${PYTHON=python3}" python_version="$(${PYTHON} --version)" if [ ! -d "${venv}" ]; then # Create virtual environment echo "Creating virtual environment at ${venv} (${python_version})" rm -rf "${venv}" "${PYTHON}" -m venv "${venv}" source "${venv}/bin/activate" pip3 install --upgrade pip pip3 install --upgrade wheel setuptools else source "${venv}/bin/activate" fi # Install Python dependencies echo 'Installing Python dependencies' pip3 install -r "${base_dir}/requirements.txt" # Install rhasspy3 rhasspy3_dir="${base_dir}/../../../.." pip3 install -e "${rhasspy3_dir}" # ----------------------------------------------------------------------------- echo "OK" ================================================ FILE: programs/mic/sounddevice/README.md ================================================ # sounddevice Audio input service for Rhasspy based on [sounddevice](https://python-sounddevice.readthedocs.io). 
def main() -> None:
    """Stream microphone audio from sounddevice as AudioChunk events on stdout."""
    parser = argparse.ArgumentParser()
    # Table-driven setup for the four integer audio-format options
    int_options = (
        ("--rate", DEFAULT_IN_RATE, "Sample rate (hertz)"),
        ("--width", DEFAULT_IN_WIDTH, "Sample width (bytes)"),
        ("--channels", DEFAULT_IN_CHANNELS, "Sample channel count"),
        (
            "--samples-per-chunk",
            DEFAULT_SAMPLES_PER_CHUNK,
            "Number of samples to process at a time",
        ),
    )
    for flag, default, help_text in int_options:
        parser.add_argument(flag, type=int, default=default, help=help_text)
    parser.add_argument("--device", help="Name or index of device to use")
    #
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to console"
    )
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    for chunk in iter_chunks(
        args.device, args.rate, args.width, args.channels, args.samples_per_chunk
    ):
        # Each raw buffer becomes one AudioChunk event for the pipeline
        write_event(AudioChunk(args.rate, args.width, args.channels, chunk).event())
def iter_chunks(
    device: Optional[Union[int, str]],
    rate: int,
    width: int,
    channels: int,
    samples_per_chunk: int,
) -> Iterable[bytes]:
    """Open a sounddevice input stream and yield raw audio chunks.

    ``device`` may be None (system default), a numeric index (as int or str),
    or a device name to look up.

    Bug fix: the original asserted ``device is not None`` after the name
    search, but an unmatched name left ``device`` as the original *string*,
    so the assertion could never fire and the stream open failed confusingly.

    NOTE(review): ``width`` is unused — dtype is hard-coded to "int16"
    (2-byte samples); confirm callers always pass width=2.
    """
    try:
        if isinstance(device, str):
            try:
                device = int(device)
            except ValueError:
                # Not an index: resolve the device name to an index
                resolved: Optional[int] = None
                for i, info in enumerate(sounddevice.query_devices()):
                    if device == info["name"]:
                        resolved = i
                        break

                assert resolved is not None, f"No device named: {device}"
                device = resolved

        _LOGGER.debug("Device: %s", device)
        with sounddevice.RawInputStream(
            samplerate=rate,
            blocksize=samples_per_chunk,
            device=device,
            channels=channels,
            dtype="int16",
        ) as stream:
            chunk, _overflowed = stream.read(samples_per_chunk)
            while chunk:
                yield chunk
                chunk, _overflowed = stream.read(samples_per_chunk)
    except KeyboardInterrupt:
        pass
#!/usr/bin/env python3
"""UDP microphone input: each datagram becomes one AudioChunk event on stdout."""
import argparse
import socketserver
from functools import partial

from rhasspy3.audio import (
    DEFAULT_IN_CHANNELS,
    DEFAULT_IN_RATE,
    DEFAULT_IN_WIDTH,
    AudioChunk,
)
from rhasspy3.event import write_event


def main() -> None:
    """Parse arguments and serve incoming UDP audio forever."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--port", type=int, required=True)
    parser.add_argument("--host", default="0.0.0.0")
    #
    parser.add_argument(
        "--rate", type=int, default=DEFAULT_IN_RATE, help="Sample rate (hertz)"
    )
    parser.add_argument(
        "--width", type=int, default=DEFAULT_IN_WIDTH, help="Sample width (bytes)"
    )
    parser.add_argument(
        "--channels",
        type=int,
        default=DEFAULT_IN_CHANNELS,
        help="Sample channel count",
    )
    args = parser.parse_args()

    # partial() pre-binds the audio format; socketserver supplies the
    # (request, client_address, server) arguments per datagram.
    handler_factory = partial(MicUDPHandler, args.rate, args.width, args.channels)
    with socketserver.UDPServer((args.host, args.port), handler_factory) as server:
        server.serve_forever()


class MicUDPHandler(socketserver.BaseRequestHandler):
    """Converts one received UDP datagram into one AudioChunk event."""

    def __init__(self, rate: int, width: int, channels: int, *args, **kwargs):
        # Attributes must be assigned BEFORE super().__init__, because
        # BaseRequestHandler.__init__ calls handle() during construction.
        self.rate = rate
        self.width = width
        self.channels = channels
        self.state = None
        super().__init__(*args, **kwargs)

    def handle(self):
        """Emit the datagram payload as an AudioChunk event on stdout."""
        audio_bytes = self.request[0]
        chunk = AudioChunk(
            rate=self.rate,
            width=self.width,
            channels=self.channels,
            audio=audio_bytes,
        )
        write_event(chunk.event())


if __name__ == "__main__":
    main()
async def play(websocket, done_event: asyncio.Event):
    """Copy binary websocket messages to stdout until the peer closes.

    Sets ``done_event`` on each binary frame; non-binary frames are ignored.
    NOTE(review): this coroutine is not referenced by main() in this file —
    it appears to be unused here; confirm before removing.
    """
    try:
        while True:
            payload = await websocket.recv()
            if not isinstance(payload, bytes):
                continue
            done_event.set()
            stdout = sys.stdout.buffer
            stdout.write(payload)
            stdout.flush()
    except ConnectionClosedOK:
        # Normal close from the remote side
        pass
-d "${venv}" ]; then # Create virtual environment echo "Creating virtual environment at ${venv} (${python_version})" rm -rf "${venv}" "${PYTHON}" -m venv "${venv}" source "${venv}/bin/activate" pip3 install --upgrade pip pip3 install --upgrade wheel setuptools else source "${venv}/bin/activate" fi # Install Python dependencies echo 'Installing Python dependencies' pip3 install -r "${base_dir}/requirements.txt" # Install rhasspy3 rhasspy3_dir="${base_dir}/../../../.." pip3 install -e "${rhasspy3_dir}" # ----------------------------------------------------------------------------- echo "OK" ================================================ FILE: programs/snd/udp_raw/bin/udp_raw.py ================================================ #!/usr/bin/env python3 import argparse import socket from rhasspy3.audio import ( DEFAULT_OUT_CHANNELS, DEFAULT_OUT_RATE, DEFAULT_OUT_WIDTH, AudioChunk, AudioChunkConverter, AudioStop, ) from rhasspy3.event import read_event, write_event from rhasspy3.snd import Played def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("--port", type=int, required=True) parser.add_argument("--host", required=True) # parser.add_argument( "--rate", type=int, default=DEFAULT_OUT_RATE, help="Sample rate (hertz)" ) parser.add_argument( "--width", type=int, default=DEFAULT_OUT_WIDTH, help="Sample width (bytes)" ) parser.add_argument( "--channels", type=int, default=DEFAULT_OUT_CHANNELS, help="Sample channel count", ) # args = parser.parse_args() converter = AudioChunkConverter(args.rate, args.width, args.channels) sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) while True: event = read_event() if event is None: break if AudioChunk.is_type(event.type): chunk = AudioChunk.from_event(event) chunk = converter.convert(chunk) sock.sendto(chunk.audio, (args.host, args.port)) elif AudioStop.is_type(event.type): break write_event(Played().event()) if __name__ == "__main__": main() ================================================ FILE: 
programs/tts/coqui-tts/README.md ================================================ # Coqui-TTS Text to speech service for Rhasspy based on [Coqui-TTS](https://tts.readthedocs.io). ================================================ FILE: programs/tts/coqui-tts/requirements.txt ================================================ tts ================================================ FILE: programs/tts/coqui-tts/script/list_models ================================================ #!/usr/bin/env bash set -eo pipefail # Directory of *this* script this_dir="$( cd "$( dirname "$0" )" && pwd )" # Base directory of repo base_dir="$(realpath "${this_dir}/..")" # Path to virtual environment : "${venv:=${base_dir}/.venv}" if [ -d "${venv}" ]; then source "${venv}/bin/activate" fi tts --list_models "$@" ================================================ FILE: programs/tts/coqui-tts/script/server ================================================ #!/usr/bin/env bash set -eo pipefail # Directory of *this* script this_dir="$( cd "$( dirname "$0" )" && pwd )" # Base directory of repo base_dir="$(realpath "${this_dir}/..")" # Path to virtual environment : "${venv:=${base_dir}/.venv}" if [ -d "${venv}" ]; then source "${venv}/bin/activate" fi tts-server --model_name 'tts_models/en/ljspeech/vits' "$@" ================================================ FILE: programs/tts/coqui-tts/script/setup ================================================ #!/usr/bin/env bash set -eo pipefail # Directory of *this* script this_dir="$( cd "$( dirname "$0" )" && pwd )" # Base directory of repo base_dir="$(realpath "${this_dir}/..")" # Path to virtual environment : "${venv:=${base_dir}/.venv}" # Python binary to use : "${PYTHON=python3}" python_version="$(${PYTHON} --version)" if [ ! 
-d "${venv}" ]; then # Create virtual environment echo "Creating virtual environment at ${venv} (${python_version})" rm -rf "${venv}" "${PYTHON}" -m venv "${venv}" source "${venv}/bin/activate" pip3 install --upgrade pip pip3 install --upgrade wheel setuptools else source "${venv}/bin/activate" fi # Install Python dependencies echo 'Installing Python dependencies' pip3 install -r "${base_dir}/requirements.txt" # ----------------------------------------------------------------------------- echo "OK" ================================================ FILE: programs/tts/flite/script/download.py ================================================ #!/usr/bin/env python3 import argparse import logging import tarfile from pathlib import Path from urllib.request import urlopen _DIR = Path(__file__).parent _LOGGER = logging.getLogger("download") def main() -> None: parser = argparse.ArgumentParser() parser.add_argument( "language", nargs="+", choices=("en", "indic"), help="Voice language(s) to download", ) parser.add_argument( "--destination", help="Path to destination directory (default: share)" ) parser.add_argument( "--link-format", default="https://github.com/rhasspy/models/releases/download/v1.0/tts_flite-{language}.tar.gz", help="Format string for download URLs", ) args = parser.parse_args() logging.basicConfig(level=logging.INFO) if not args.destination: args.destination = _DIR.parent / "share" args.destination.parent.mkdir(parents=True, exist_ok=True) for language in args.language: url = args.link_format.format(language=language) _LOGGER.info("Downloading %s", url) with urlopen(url) as response: with tarfile.open(mode="r|*", fileobj=response) as tar_gz: _LOGGER.info("Extracting to %s", args.destination) tar_gz.extractall(args.destination) if __name__ == "__main__": main() ================================================ FILE: programs/tts/flite/script/setup ================================================ #!/usr/bin/env bash sudo apt-get update sudo apt-get install flite 
#!/usr/bin/env python3
"""Client for a MaryTTS-compatible HTTP endpoint: text on stdin, WAV on stdout."""
import argparse
import logging
import shutil
import sys
from pathlib import Path
from urllib.parse import urlencode
from urllib.request import urlopen

_FILE = Path(__file__)
_DIR = _FILE.parent
_LOGGER = logging.getLogger(_FILE.stem)


def main():
    """Read all of stdin, request synthesis, and stream the audio to stdout."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "url",
        help="URL of API endpoint",
    )
    arg_parser.add_argument("voice", help="VOICE parameter")
    arg_parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to console"
    )
    args = arg_parser.parse_args()

    log_level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(level=log_level)

    # The entire stdin stream is the text to synthesize.
    text = sys.stdin.read()
    query = urlencode({"INPUT_TEXT": text, "VOICE": args.voice})
    request_url = f"{args.url}?{query}"
    _LOGGER.debug(request_url)

    # Relay the WAV payload directly to binary stdout without buffering it all.
    with urlopen(request_url) as response:
        shutil.copyfileobj(response, sys.stdout.buffer)


if __name__ == "__main__":
    main()
-d "${venv}" ]; then # Create virtual environment echo "Creating virtual environment at ${venv} (${python_version})" rm -rf "${venv}" "${PYTHON}" -m venv "${venv}" source "${venv}/bin/activate" pip3 install --upgrade pip pip3 install --upgrade wheel setuptools else source "${venv}/bin/activate" fi # Install Python dependencies echo 'Installing Python dependencies' pip3 install -r "${base_dir}/requirements.txt" \ 'jinja2<3.1.0' \ -f 'https://synesthesiam.github.io/prebuilt-apps/' \ -f 'https://download.pytorch.org/whl/cpu/torch_stable.html' # ----------------------------------------------------------------------------- echo "OK" ================================================ FILE: programs/tts/marytts/bin/marytts.py ================================================ #!/usr/bin/env python3 import argparse import logging import shutil import sys from pathlib import Path from urllib.parse import urlencode from urllib.request import urlopen _FILE = Path(__file__) _DIR = _FILE.parent _LOGGER = logging.getLogger(_FILE.stem) def main(): parser = argparse.ArgumentParser() parser.add_argument( "url", help="URL of API endpoint", ) parser.add_argument("voice", help="VOICE parameter") parser.add_argument( "--debug", action="store_true", help="Print DEBUG messages to console" ) args = parser.parse_args() logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) params = {"INPUT_TEXT": sys.stdin.read(), "VOICE": args.voice} url = args.url + "?" + urlencode(params) _LOGGER.debug(url) with urlopen(url) as response: shutil.copyfileobj(response, sys.stdout.buffer) if __name__ == "__main__": main() ================================================ FILE: programs/tts/mimic3/README.md ================================================ # Mimic 3 Text to speech service for Rhasspy based on [Mimic 3](https://github.com/mycroftAI/mimic3). 
#!/usr/bin/env python3
"""Unix-domain-socket TTS server around Mimic 3.

Accepts newline-delimited JSON "synthesize" events and answers with
audio-start / audio-chunk / audio-stop events plus raw audio payloads.
"""
import argparse
import json
import logging
import os
import socket
import threading
from pathlib import Path

from mimic3_tts import (
    DEFAULT_VOICE,
    AudioResult,
    Mimic3Settings,
    Mimic3TextToSpeechSystem,
)

_FILE = Path(__file__)
_DIR = _FILE.parent
_LOGGER = logging.getLogger(_FILE.stem)


def main() -> None:
    """Parse args, bind the Unix socket, preload the voice, and serve clients."""
    parser = argparse.ArgumentParser()
    parser.add_argument("voices_dir", help="Path to directory with /")
    parser.add_argument("--voice", default=DEFAULT_VOICE, help="Name of voice to use")
    parser.add_argument(
        "--socketfile", required=True, help="Path to Unix domain socket file"
    )
    parser.add_argument("--debug", action="store_true", help="Log DEBUG messages")
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    # Remove a stale socket file left behind by a previous run.
    try:
        os.unlink(args.socketfile)
    except OSError:
        pass

    try:
        server_sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        server_sock.bind(args.socketfile)
        server_sock.listen()

        mimic3 = Mimic3TextToSpeechSystem(
            Mimic3Settings(
                voices_directories=[args.voices_dir],
                voices_download_dir=args.voices_dir,
            )
        )

        # "voice#speaker" selects one speaker of a multi-speaker voice;
        # only the voice part is preloaded.
        if "#" in args.voice:
            preload_name = args.voice.split("#", maxsplit=1)[0]
        else:
            preload_name = args.voice

        _LOGGER.debug("Preloading voice: %s", preload_name)
        mimic3.preload_voice(preload_name)

        _LOGGER.info("Ready")
        mimic3.voice = args.voice

        # Accept loop: one daemon thread per client connection.
        while True:
            try:
                client_conn, client_addr = server_sock.accept()
                _LOGGER.debug("Connection from %s", client_addr)

                threading.Thread(
                    target=handle_client,
                    args=(client_conn, mimic3),
                    daemon=True,
                ).start()
            except KeyboardInterrupt:
                break
            except Exception:
                _LOGGER.exception("Error communicating with socket client")
    finally:
        os.unlink(args.socketfile)


def handle_client(connection: socket.socket, mimic3: Mimic3TextToSpeechSystem) -> None:
    """Serve a single client: one synthesize request, then close."""
    try:
        with connection, connection.makefile(mode="rwb") as conn_file:
            while True:
                event_info = json.loads(conn_file.readline())
                if event_info["type"] != "synthesize":
                    # Ignore anything that is not a synthesize request.
                    continue

                text = event_info["data"]["text"]
                _LOGGER.debug("synthesize: text='%s'", text)

                mimic3.begin_utterance()
                mimic3.speak_text(text)
                results = mimic3.end_utterance()

                sent_audio_start = False
                for result in results:
                    if not isinstance(result, AudioResult):
                        continue

                    data = {
                        "rate": result.sample_rate_hz,
                        "width": result.sample_width_bytes,
                        "channels": result.num_channels,
                    }

                    # audio-start is emitted once, before the first chunk.
                    if not sent_audio_start:
                        sent_audio_start = True
                        conn_file.write(
                            (
                                json.dumps({"type": "audio-start", "data": data})
                                + "\n"
                            ).encode()
                        )

                    chunk_header = json.dumps(
                        {
                            "type": "audio-chunk",
                            "data": data,
                            "payload_length": len(result.audio_bytes),
                        }
                    )
                    conn_file.write((chunk_header + "\n").encode())
                    conn_file.write(result.audio_bytes)

                conn_file.write(
                    (
                        json.dumps({"type": "audio-stop"}, ensure_ascii=False)
                        + "\n"
                    ).encode()
                )
                break
    except Exception:
        _LOGGER.exception("Unexpected error in client thread")


# -----------------------------------------------------------------------------

if __name__ == "__main__":
    main()
"${socket_dir}/mimic3.socket" "$@" ================================================ FILE: programs/tts/mimic3/script/setup ================================================ #!/usr/bin/env bash set -eo pipefail # Directory of *this* script this_dir="$( cd "$( dirname "$0" )" && pwd )" # Base directory of repo base_dir="$(realpath "${this_dir}/..")" # Path to virtual environment : "${venv:=${base_dir}/.venv}" # Python binary to use : "${PYTHON=python3}" python_version="$(${PYTHON} --version)" if [ ! -d "${venv}" ]; then # Create virtual environment echo "Creating virtual environment at ${venv} (${python_version})" rm -rf "${venv}" "${PYTHON}" -m venv "${venv}" source "${venv}/bin/activate" pip3 install --upgrade pip pip3 install --upgrade wheel setuptools else source "${venv}/bin/activate" fi # Install Python dependencies echo 'Installing Python dependencies' pip3 install -r "${base_dir}/requirements.txt" mimic3-download --output-dir "${base_dir}/share" 'apope' # ----------------------------------------------------------------------------- echo "OK" ================================================ FILE: programs/tts/piper/README.md ================================================ # Piper Text to speech service for Rhasspy based on [Piper](https://github.com/rhasspy/piper). ## Installation 1. Copy the contents of this directory to `config/programs/tts/piper/` 2. Run `script/setup` 3. Download a model with `script/download.py` * Example: `script/download.py english` * Models are downloaded to `config/data/tts/piper` directory 4. Test with `bin/piper` * Example `echo 'Welcome to the world of speech synthesis.' 
#!/usr/bin/env python3
"""Unix-domain-socket TTS server that drives one long-running piper process.

piper reads a line of text on stdin and prints the path of the WAV file it
wrote (into --output_dir) on stdout; each client gets the audio streamed
back as audio-start / audio-chunk / audio-stop JSON events.
"""
import argparse
import json
import logging
import os
import socket
import subprocess
import tempfile
import wave
from pathlib import Path

_FILE = Path(__file__)
_DIR = _FILE.parent
_LOGGER = logging.getLogger(_FILE.stem)


def main() -> None:
    """Bind the Unix socket, spawn piper, and serve synthesize requests."""
    parser = argparse.ArgumentParser()
    parser.add_argument("model", help="Path to model file (.onnx)")
    parser.add_argument(
        "--socketfile", required=True, help="Path to Unix domain socket file"
    )
    parser.add_argument(
        "--auto-punctuation", default=".?!", help="Automatically add punctuation"
    )
    parser.add_argument("--debug", action="store_true", help="Log DEBUG messages")
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    # Remove a stale socket file left behind by a previous run.
    try:
        os.unlink(args.socketfile)
    except OSError:
        pass

    try:
        server_sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        server_sock.bind(args.socketfile)
        server_sock.listen()

        with tempfile.TemporaryDirectory() as temp_dir:
            piper_cmd = [
                str(_DIR / "piper"),
                "--model",
                str(args.model),
                "--output_dir",
                temp_dir,
            ]

            with subprocess.Popen(
                piper_cmd,
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                universal_newlines=True,
            ) as proc:
                _LOGGER.info("Ready")

                # Connections are handled sequentially: piper's stdin/stdout
                # protocol is one request at a time.
                while True:
                    try:
                        client_conn, client_addr = server_sock.accept()
                        _LOGGER.debug("Connection from %s", client_addr)
                        handle_connection(client_conn, proc, args)
                    except KeyboardInterrupt:
                        break
                    except Exception:
                        _LOGGER.exception("Error communicating with socket client")
    finally:
        os.unlink(args.socketfile)


def handle_connection(
    connection: socket.socket, proc: subprocess.Popen, args: argparse.Namespace
) -> None:
    """Serve one client: synthesize a single text and stream the WAV back."""
    assert proc.stdin is not None
    assert proc.stdout is not None

    with connection, connection.makefile(mode="rwb") as conn_file:
        while True:
            event_info = json.loads(conn_file.readline())
            if event_info["type"] != "synthesize":
                # Skip anything that is not a synthesize request.
                continue

            raw_text = event_info["data"]["text"]
            text = raw_text.strip()
            if args.auto_punctuation and text:
                # Append the first configured punctuation character when the
                # text does not already end with one.
                if text[-1] not in args.auto_punctuation:
                    text += args.auto_punctuation[0]

            _LOGGER.debug("synthesize: raw_text=%s, text='%s'", raw_text, text)

            # Text in, file path out
            print(text.strip(), file=proc.stdin, flush=True)
            wav_path = proc.stdout.readline().strip()
            _LOGGER.debug(wav_path)

            with wave.open(wav_path, "rb") as wav_file:
                data = {
                    "rate": wav_file.getframerate(),
                    "width": wav_file.getsampwidth(),
                    "channels": wav_file.getnchannels(),
                }

                start_msg = json.dumps(
                    {"type": "audio-start", "data": data}, ensure_ascii=False
                )
                conn_file.write((start_msg + "\n").encode())

                # The whole file is sent as one audio-chunk event.
                audio_bytes = wav_file.readframes(wav_file.getnframes())
                chunk_msg = json.dumps(
                    {
                        "type": "audio-chunk",
                        "data": data,
                        "payload_length": len(audio_bytes),
                    },
                    ensure_ascii=False,
                )
                conn_file.write((chunk_msg + "\n").encode())
                conn_file.write(audio_bytes)

            stop_msg = json.dumps({"type": "audio-stop"}, ensure_ascii=False)
            conn_file.write((stop_msg + "\n").encode())

            os.unlink(wav_path)
            break


# -----------------------------------------------------------------------------

if __name__ == "__main__":
    main()
"en-us-lessac-medium", "en-us-libritts-high", "en-us-ryan-high", "en-us-ryan-low", "en-us-ryan-medium", "es-carlfm-x-low", "fi-harri-low", "fr-siwis-low", "fr-siwis-medium", "it-riccardo_fasol-x-low", "kk-iseke-x-low", "kk-issai-high", "kk-raya-x-low", "ne-google-medium", "ne-google-x-low", "nl-mls_7432-low", "nl-nathalie-x-low", "nl-rdh-medium", "nl-rdh-x-low", "no-talesyntese-medium", "pl-mls_6892-low", "pt-br-edresson-low", "uk-lada-x-low", "vi-25hours-single-low", "vi-vivos-x-low", "zh-cn-huayan-x-low", ] _VOICES = { "catalan": "ca", "ca": "ca-upc_ona-x-low", # "danish": "da", "da": "da-nst_talesyntese-medium", # "german": "de", "de": "de-thorsten-low", # "english": "en", "en": "en-us", "en-us": "en-us-lessac-low", "en-gb": "en-gb-alan-low", # "spanish": "es", "es": "es-carlfm-x-low", # "french": "fr", "fr": "fr-siwis-low", # "italian": "it", "it": "it-riccardo_fasol-x-low", # "kazakh": "kk", "kk": "kk-iseke-x-low", # "nepali": "ne", "ne": "ne-google-x-low", # "dutch": "nl", "nl": "nl-rdh-x-low", # "norwegian": "no", "no": "no-talesyntese-medium", # "polish": "pl", "pl": "pl-mls_6892-low", # "portuguese": "pt", "pt": "pt-br", "pt-br": "pt-br-edresson-low", # "ukrainian": "uk", "uk": "uk-lada-x-low", # "vietnamese": "vi", "vi": "vi-25hours-single-low", # "chinese": "zh", "zh": "zh-cn", "zh-cn": "zh-cn-huayan-x-low", } def main() -> None: parser = argparse.ArgumentParser() parser.add_argument( "name", nargs="+", choices=sorted(_VOICES.keys()), help="Voice language(s) to download", ) parser.add_argument( "--destination", help="Path to destination directory (default: share)" ) parser.add_argument( "--link-format", default="https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-{name}.tar.gz", help="Format string for download URLs", ) parser.add_argument( "--dry-run", action="store_true", help="Don't actually download" ) args = parser.parse_args() logging.basicConfig(level=logging.INFO) if args.destination: args.destination = Path(args.destination) else: # 
Assume we're in programs/tts/piper/script data_dir = _DIR.parent.parent.parent.parent / "data" args.destination = data_dir / "tts" / "piper" args.destination.parent.mkdir(parents=True, exist_ok=True) for voice_name in _VOICE_NAMES: _VOICES[voice_name] = voice_name for name in args.name: resolved_name = _VOICES[name] while resolved_name != name: name = resolved_name resolved_name = _VOICES[name] url = args.link_format.format(name=name) _LOGGER.info("Downloading %s", url) if args.dry_run: return with urlopen(url) as response: with tarfile.open(mode="r|*", fileobj=response) as tar_gz: _LOGGER.info("Extracting to %s", args.destination) tar_gz.extractall(args.destination) if __name__ == "__main__": main() ================================================ FILE: programs/tts/piper/script/server ================================================ #!/usr/bin/env bash set -eo pipefail # Directory of *this* script this_dir="$( cd "$( dirname "$0" )" && pwd )" # Base directory of repo base_dir="$(realpath "${this_dir}/..")" # Path to virtual environment : "${venv:=${base_dir}/.venv}" if [ -d "${venv}" ]; then source "${venv}/bin/activate" fi socket_dir="${base_dir}/var/run" mkdir -p "${socket_dir}" python3 "${base_dir}/bin/piper_server.py" --socketfile "${socket_dir}/piper.socket" "$@" ================================================ FILE: programs/tts/piper/script/setup.py ================================================ #!/usr/bin/env python3 import argparse import logging import platform import shutil import tarfile import tempfile from pathlib import Path from urllib.request import urlopen _DIR = Path(__file__).parent _LOGGER = logging.getLogger("setup") PLATFORMS = {"x86_64": "amd64", "aarch64": "arm64"} def main() -> None: parser = argparse.ArgumentParser() parser.add_argument( "--platform", help="CPU architecture to download (amd64, arm64)", ) parser.add_argument( "--destination", help="Path to destination directory (default: bin)" ) parser.add_argument( "--link-format", 
default="https://github.com/rhasspy/piper/releases/download/v0.0.2/piper_{platform}.tar.gz", help="Format string for download URLs", ) args = parser.parse_args() logging.basicConfig(level=logging.INFO) if not args.platform: args.platform = platform.machine() args.platform = PLATFORMS.get(args.platform, args.platform) if not args.destination: args.destination = _DIR.parent / "bin" else: args.destination = Path(args.destination) args.destination.mkdir(parents=True, exist_ok=True) url = args.link_format.format(platform=args.platform) _LOGGER.info("Downloading %s", url) with urlopen(url) as response, tempfile.TemporaryDirectory() as temp_dir_str: temp_dir = Path(temp_dir_str) with tarfile.open(mode="r|*", fileobj=response) as tar_gz: _LOGGER.info("Extracting to %s", temp_dir) tar_gz.extractall(temp_dir) # Move piper/ contents piper_dir = temp_dir / "piper" for path in piper_dir.iterdir(): rel_path = path.relative_to(piper_dir) if path.is_dir(): shutil.copytree(path, args.destination / rel_path, symlinks=True) else: shutil.copy(path, args.destination / rel_path, follow_symlinks=False) if __name__ == "__main__": main() ================================================ FILE: programs/vad/energy/bin/energy_speech_prob.py ================================================ #!/usr/bin/env python3 import argparse import audioop import logging import sys from pathlib import Path _FILE = Path(__file__) _DIR = _FILE.parent _LOGGER = logging.getLogger(_FILE.stem) def main() -> None: parser = argparse.ArgumentParser() parser.add_argument( "--threshold", type=float, required=True, help="Energy threshold above which is considered speech", ) parser.add_argument( "--width", type=int, required=True, help="Sample width bytes", ) parser.add_argument( "--samples-per-chunk", required=True, type=int, help="Samples to send to command at a time", ) # parser.add_argument("--debug", action="store_true", help="Log DEBUG messages") args = parser.parse_args() logging.basicConfig(level=logging.DEBUG if 
args.debug else logging.INFO) bytes_per_chunk = args.samples_per_chunk * args.width try: chunk = sys.stdin.buffer.read(bytes_per_chunk) while chunk: energy = get_debiased_energy(chunk, args.width) speech_probability = 1 if energy > args.threshold else 0 print(speech_probability, flush=True) chunk = sys.stdin.buffer.read(bytes_per_chunk) except KeyboardInterrupt: pass # ----------------------------------------------------------------------------- def get_debiased_energy(audio_data: bytes, width: int) -> float: """Compute RMS of debiased audio.""" # Thanks to the speech_recognition library! # https://github.com/Uberi/speech_recognition/blob/master/speech_recognition/__init__.py energy = -audioop.rms(audio_data, width) energy_bytes = bytes([energy & 0xFF, (energy >> 8) & 0xFF]) debiased_energy = audioop.rms( audioop.add(audio_data, energy_bytes * (len(audio_data) // width), width), width, ) return debiased_energy if __name__ == "__main__": main() ================================================ FILE: programs/vad/silero/README.md ================================================ # Silero VAD Voice activity detection service for Rhasspy based on [silero-vad](https://github.com/snakers4/silero-vad). 
#!/usr/bin/env python3
"""Silero VAD: read 16-bit mono PCM chunks on stdin, print P(speech) per chunk."""
import argparse
import logging
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Union

import numpy as np
import onnxruntime

_FILE = Path(__file__)
_DIR = _FILE.parent
_LOGGER = logging.getLogger(_FILE.stem)


def main() -> None:
    """Stream fixed-size chunks from stdin through the Silero ONNX model."""
    parser = argparse.ArgumentParser()
    parser.add_argument("model", help="Path to Silero model")
    parser.add_argument("--samples-per-chunk", type=int, default=512)
    # BUG FIX: this option was commented out while args.debug was still read
    # below, so every invocation crashed with AttributeError.
    parser.add_argument("--debug", action="store_true", help="Log DEBUG messages")
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    bytes_per_chunk = args.samples_per_chunk * 2  # 16-bit

    detector = SileroDetector(args.model)
    detector.start()

    try:
        chunk = sys.stdin.buffer.read(bytes_per_chunk)
        while chunk:
            speech_probability = detector.get_speech_probability(chunk)
            print(speech_probability, flush=True)

            chunk = sys.stdin.buffer.read(bytes_per_chunk)
    except (KeyboardInterrupt, BrokenPipeError):
        pass


# -----------------------------------------------------------------------------


@dataclass
class SileroDetector:
    """Wraps the Silero VAD ONNX model together with its recurrent h/c state."""

    model: Union[str, Path]
    _session: Optional[onnxruntime.InferenceSession] = None
    _h_array: Optional[np.ndarray] = None
    _c_array: Optional[np.ndarray] = None

    def start(self):
        """Load the model and zero the recurrent state."""
        _LOGGER.debug("Loading VAD model: %s", self.model)
        # BUG FIX: the thread limits were previously assigned as attributes on
        # an already-constructed InferenceSession, which onnxruntime ignores.
        # They must be supplied via SessionOptions at construction time.
        sess_options = onnxruntime.SessionOptions()
        sess_options.intra_op_num_threads = 1
        sess_options.inter_op_num_threads = 1
        self._session = onnxruntime.InferenceSession(
            str(self.model), sess_options=sess_options
        )

        self._h_array = np.zeros((2, 1, 64)).astype("float32")
        self._c_array = np.zeros((2, 1, 64)).astype("float32")

    def get_speech_probability(self, chunk: bytes) -> float:
        """Return the model's speech probability for one chunk of 16-bit PCM."""
        assert self._session is not None

        audio_array = np.frombuffer(chunk, dtype=np.int16)

        # Add batch dimension
        # NOTE(review): samples are passed to the model unscaled (raw int16
        # range as float32); this matches the bundled model export — confirm
        # if the model file is ever updated.
        audio_array = np.expand_dims(audio_array, 0)

        ort_inputs = {
            "input": audio_array.astype(np.float32),
            "h0": self._h_array,
            "c0": self._c_array,
        }
        ort_outs = self._session.run(None, ort_inputs)
        out, self._h_array, self._c_array = ort_outs
        probability = out.squeeze(2)[:, 1].item()

        return probability

    def stop(self):
        """Release the model and recurrent state."""
        self._session = None
        self._h_array = None
        self._c_array = None

    def reset(self):
        """Zero the recurrent state (e.g. between utterances)."""
        assert (self._h_array is not None) and (self._c_array is not None)
        self._h_array.fill(0)
        self._c_array.fill(0)


# -----------------------------------------------------------------------------

if __name__ == "__main__":
    main()
#!/usr/bin/env python3
"""webrtcvad wrapper: read raw PCM chunks on stdin, print 1/0 speech flags."""
import argparse
import logging
import sys
from pathlib import Path

import webrtcvad

_FILE = Path(__file__)
_DIR = _FILE.parent
_LOGGER = logging.getLogger(_FILE.stem)


def main() -> None:
    """Read fixed-size chunks from stdin and print webrtcvad speech decisions."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "mode",
        choices=(0, 1, 2, 3),
        type=int,
        help="Aggressiveness in filtering out non-speech",
    )
    parser.add_argument(
        "--rate",
        type=int,
        default=16000,
        help="Sample rate (hz)",
    )
    parser.add_argument(
        "--width",
        type=int,
        default=2,
        help="Sample width bytes",
    )
    parser.add_argument("--samples-per-chunk", type=int, default=480)
    # BUG FIX: this option was commented out while args.debug was still read
    # below, so every invocation crashed with AttributeError.
    parser.add_argument("--debug", action="store_true", help="Log DEBUG messages")
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    # webrtcvad only accepts 10, 20, or 30 ms frames.
    chunk_ms = 1000 * (args.samples_per_chunk / args.rate)
    assert chunk_ms in [10, 20, 30], (
        "Sample rate and chunk size must make for 10, 20, or 30 ms buffer sizes,"
        + f" assuming mono audio (got {chunk_ms} ms)"
    )

    bytes_per_chunk = args.samples_per_chunk * args.width

    vad = webrtcvad.Vad()
    vad.set_mode(args.mode)

    try:
        chunk = sys.stdin.buffer.read(bytes_per_chunk)
        while chunk:
            speech_probability = 1 if vad.is_speech(chunk, args.rate) else 0
            print(speech_probability, flush=True)

            chunk = sys.stdin.buffer.read(bytes_per_chunk)
    except KeyboardInterrupt:
        pass


# -----------------------------------------------------------------------------

if __name__ == "__main__":
    main()
#!/usr/bin/env python3
"""Porcupine wake word detection on raw 16Khz, 16-bit mono PCM from stdin.

Prints the detected keyword's name, one per line, as audio is consumed.
"""
import logging
import struct
import sys
from pathlib import Path

from porcupine_shared import get_arg_parser, load_porcupine

_FILE = Path(__file__)
_DIR = _FILE.parent
_LOGGER = logging.getLogger(_FILE.stem)

# -----------------------------------------------------------------------------


def main() -> None:
    """Main method."""
    parser = get_arg_parser()
    parser.add_argument("--samples-per-chunk", type=int, default=512)
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    porcupine, names = load_porcupine(args)

    chunk_format = "h" * porcupine.frame_length
    bytes_per_chunk = porcupine.frame_length * 2  # 16-bit samples

    # Read 16Khz, 16-bit mono PCM from stdin
    try:
        chunk = bytes()
        next_chunk = sys.stdin.buffer.read(bytes_per_chunk)
        while next_chunk:
            # BUG FIX: append the newly-read data *before* processing.  The
            # original appended only after the inner loop, which silently
            # dropped the very first chunk of audio read from stdin.
            chunk += next_chunk

            # Process as many full frames as are buffered.
            while len(chunk) >= bytes_per_chunk:
                unpacked_chunk = struct.unpack_from(
                    chunk_format, chunk[:bytes_per_chunk]
                )
                keyword_index = porcupine.process(unpacked_chunk)
                if keyword_index >= 0:
                    print(names[keyword_index], flush=True)

                chunk = chunk[bytes_per_chunk:]

            next_chunk = sys.stdin.buffer.read(bytes_per_chunk)
    except KeyboardInterrupt:
        pass


# -----------------------------------------------------------------------------

if __name__ == "__main__":
    main()
def load_porcupine(args: argparse.Namespace) -> Tuple[pvporcupine.Porcupine, List[str]]:
    """Loads porcupine keywords.

    Returns Porcupine object and list of keyword names (in order)."""
    # Directory holding the keyword files that ship with pvporcupine
    builtin_dir = Path(next(iter(pvporcupine.pv_keyword_paths("").values()))).parent

    names: List[str] = []
    keyword_paths: List[Path] = []
    sensitivities: List[float] = []

    # Optional non-English language model (.pv file)
    model_path = None
    if args.lang_model is not None:
        model_path = str(Path(args.lang_model).absolute())

    for settings in args.model:
        path_str = settings[0]
        resolved = Path(path_str)
        if not resolved.exists():
            # Fall back to the embedded keyword directory
            resolved = builtin_dir / path_str

        assert resolved.exists(), f"Cannot find {path_str}"
        keyword_paths.append(resolved)
        names.append(resolved.stem)

        # Optional per-keyword sensitivity, defaulting to 0.5
        sensitivities.append(float(settings[1]) if len(settings) > 1 else 0.5)

    porcupine = pvporcupine.create(
        keyword_paths=[str(p.absolute()) for p in keyword_paths],
        sensitivities=sensitivities,
        model_path=model_path,
    )

    return porcupine, names
AudioStop.is_type(event.type): break if not AudioChunk.is_type(event.type): continue chunk = AudioChunk.from_event(event) audio_bytes += chunk.audio while len(audio_bytes) >= bytes_per_chunk: unpacked_chunk = struct.unpack_from( chunk_format, audio_bytes[:bytes_per_chunk] ) keyword_index = porcupine.process(unpacked_chunk) if keyword_index >= 0: write_event( Detection( name=names[keyword_index], timestamp=chunk.timestamp ).event() ) is_detected = True audio_bytes = audio_bytes[bytes_per_chunk:] if is_detected: write_event(NotDetected().event()) except KeyboardInterrupt: pass # ----------------------------------------------------------------------------- if __name__ == "__main__": main() ================================================ FILE: programs/wake/porcupine1/requirements.txt ================================================ pvporcupine~=1.9.0 ================================================ FILE: programs/wake/porcupine1/script/download.py ================================================ #!/usr/bin/env python3 import argparse import logging import tarfile from pathlib import Path from urllib.request import urlopen _DIR = Path(__file__).parent _LOGGER = logging.getLogger("download") def main() -> None: parser = argparse.ArgumentParser() parser.add_argument( "--destination", help="Path to destination directory (default: data)" ) parser.add_argument( "--url", default="https://github.com/rhasspy/models/releases/download/v1.0/wake_porcupine1-data.tar.gz", help="URL of porcupine1 data", ) args = parser.parse_args() logging.basicConfig(level=logging.INFO) if args.destination: args.destination = Path(args.destination) else: # Assume we're in programs/wake/porcupine1/script data_dir = _DIR.parent.parent.parent.parent / "data" args.destination = data_dir / "wake" / "porcupine1" args.destination.parent.mkdir(parents=True, exist_ok=True) _LOGGER.info("Downloading %s", args.url) with urlopen(args.url) as response: with tarfile.open(mode="r|*", fileobj=response) as tar_gz: 
_LOGGER.info("Extracting to %s", args.destination) tar_gz.extractall(args.destination) if __name__ == "__main__": main() ================================================ FILE: programs/wake/porcupine1/script/list_models ================================================ #!/usr/bin/env bash set -eo pipefail # Directory of *this* script this_dir="$( cd "$( dirname "$0" )" && pwd )" # Base directory of repo base_dir="$(realpath "${this_dir}/..")" # Path to virtual environment : "${venv:=${base_dir}/.venv}" if [ -d "${venv}" ]; then source "${venv}/bin/activate" fi python3 "${base_dir}/bin/list_models.py" "$@" ================================================ FILE: programs/wake/porcupine1/script/raw2text ================================================ #!/usr/bin/env bash set -eo pipefail # Directory of *this* script this_dir="$( cd "$( dirname "$0" )" && pwd )" # Base directory of repo base_dir="$(realpath "${this_dir}/..")" # Path to virtual environment : "${venv:=${base_dir}/.venv}" if [ -d "${venv}" ]; then source "${venv}/bin/activate" fi python3 "${base_dir}/bin/porcupine_raw_text.py" "$@" ================================================ FILE: programs/wake/porcupine1/script/setup ================================================ #!/usr/bin/env bash set -eo pipefail # Directory of *this* script this_dir="$( cd "$( dirname "$0" )" && pwd )" # Base directory of repo base_dir="$(realpath "${this_dir}/..")" # Path to virtual environment : "${venv:=${base_dir}/.venv}" # Python binary to use : "${PYTHON=python3}" python_version="$(${PYTHON} --version)" if [ ! 
-d "${venv}" ]; then # Create virtual environment echo "Creating virtual environment at ${venv} (${python_version})" rm -rf "${venv}" "${PYTHON}" -m venv "${venv}" source "${venv}/bin/activate" pip3 install --upgrade pip pip3 install --upgrade wheel setuptools else source "${venv}/bin/activate" fi # Install Python dependencies echo 'Installing Python dependencies' pip3 install -r "${base_dir}/requirements.txt" # ----------------------------------------------------------------------------- echo "OK" ================================================ FILE: programs/wake/precise-lite/bin/precise.py ================================================ #!/usr/bin/env python3 # Copyright 2021 Mycroft AI Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import argparse import logging import os import sys import typing from dataclasses import dataclass from enum import IntEnum from math import floor from pathlib import Path from typing import Any, Optional, Union import numpy as np import tflite_runtime.interpreter as tflite from sonopy import mfcc_spec MAX_WAV_VALUE = 32768 _log = logging.getLogger("mycroft_hotword") def main(): parser = argparse.ArgumentParser() parser.add_argument("model", help="Path to TFLite model") parser.add_argument( "--sensitivity", type=float, default=0.8, help="Model sensitivity (0-1, default: 0.8)", ) parser.add_argument( "--trigger-level", type=int, default=4, help="Number of activations before detection occurs (default: 4)", ) parser.add_argument( "--chunk-size", type=int, default=2048, help="Number of bytes to read at a time from stdin", ) parser.add_argument("--debug", action="store_true", help="Log DEBUG messages") args = parser.parse_args() logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) args.model = Path(args.model).absolute() engine = TFLiteHotWordEngine( local_model_file=args.model, sensitivity=args.sensitivity, trigger_level=args.trigger_level, chunk_size=args.chunk_size, ) if os.isatty(sys.stdin.fileno()): print("Reading raw 16-bit 16khz mono audio from stdin", file=sys.stderr) is_first_audio = True try: while True: chunk = sys.stdin.buffer.read(args.chunk_size) if not chunk: break if is_first_audio: _log.info("Receiving audio") is_first_audio = False engine.update(chunk) if engine.found_wake_word(None): print(args.model.name, flush=True) except KeyboardInterrupt: pass # ----------------------------------------------------------------------------- class TFLiteHotWordEngine: def __init__( self, local_model_file: Union[str, Path], sensitivity: float = 0.7, trigger_level: int = 4, chunk_size: int = 2048, ): self.sensitivity = sensitivity self.trigger_level = trigger_level self.chunk_size = chunk_size self.model_path = Path(local_model_file).absolute() 
self._interpreter: Optional[tflite.Interpreter] = None self._params: Optional[ListenerParams] = None self._input_details: Optional[Any] = None self._output_details: Optional[Any] = None # Rolling window of MFCCs (fixed sized) self._inputs: Optional[np.ndarray] = None # Current MFCC timestep self._inputs_idx: int = 0 # Bytes for one window of audio self._window_bytes: int = 0 # Bytes for one MFCC hop self._hop_bytes: int = 0 # Raw audio self._chunk_buffer = bytes() # Activation level (> trigger_level = wake word found) self._activation: int = 0 # True if wake word was found during last update self._is_found = False # There doesn't seem to be an initialize() method for wake word plugins, # so we'll load the model here. self._load_model() # Last probability self._probability: Optional[float] = None def _load_model(self): _log.debug("Loading model from %s", self.model_path) self._interpreter = tflite.Interpreter(model_path=str(self.model_path)) self._interpreter.allocate_tensors() self._input_details = self._interpreter.get_input_details() self._output_details = self._interpreter.get_output_details() # TODO: Load these from adjacent file self._params = ListenerParams() self._window_bytes = self._params.window_samples * self._params.sample_depth self._hop_bytes = self._params.hop_samples * self._params.sample_depth # Rolling window of MFCCs (fixed sized) self._inputs = np.zeros( (1, self._params.n_features, self._params.n_mfcc), dtype=np.float32 ) def update(self, chunk): self._is_found = False self._chunk_buffer += chunk self._probability = None # Process all available windows while len(self._chunk_buffer) >= self._window_bytes: # Process current audio audio = buffer_to_audio(self._chunk_buffer) # TODO: Implement different MFCC algorithms mfccs = mfcc_spec( audio, self._params.sample_rate, (self._params.window_samples, self._params.hop_samples), num_filt=self._params.n_filt, fft_size=self._params.n_fft, num_coeffs=self._params.n_mfcc, ) num_timesteps = mfccs.shape[0] # 
Remove processed audio from buffer self._chunk_buffer = self._chunk_buffer[num_timesteps * self._hop_bytes :] # Check if we have a full set of inputs yet inputs_end_idx = self._inputs_idx + num_timesteps if inputs_end_idx > self._inputs.shape[1]: # Full set, need to roll back existing inputs self._inputs = np.roll(self._inputs, -num_timesteps, axis=1) inputs_end_idx = self._inputs.shape[1] self._inputs_idx = inputs_end_idx - num_timesteps # Insert new MFCCs at the end self._inputs[0, self._inputs_idx : inputs_end_idx, :] = mfccs self._inputs_idx += num_timesteps if inputs_end_idx < self._inputs.shape[1]: # Don't have a full set of inputs yet continue # TODO: Add deltas # raw_output self._interpreter.set_tensor(self._input_details[0]["index"], self._inputs) self._interpreter.invoke() raw_output = self._interpreter.get_tensor(self._output_details[0]["index"]) prob = raw_output[0][0] if (prob < 0.0) or (prob > 1.0): # TODO: Handle out of range. # Not seeing these currently, so ignoring. continue self._probability = prob.item() # Decode activated = prob > 1.0 - self.sensitivity triggered = False if activated or (self._activation < 0): # Increase activation self._activation += 1 triggered = self._activation > self.trigger_level if triggered or (activated and (self._activation < 0)): # Push activation down far to avoid an accidental re-activation self._activation = -(8 * 2048) // self.chunk_size elif self._activation > 0: # Decrease activation self._activation -= 1 if triggered: self._is_found = True _log.debug("Triggered") break return self._is_found def found_wake_word(self, frame_data): return self._is_found def reset(self): self._inputs = np.zeros( (1, self._params.n_features, self._params.n_mfcc), dtype=np.float32 ) self._activation = 0 self._is_found = False self._inputs_idx = 0 self._chunk_buffer = bytes() @property def probability(self) -> Optional[float]: return self._probability # ----------------------------------------------------------------------------- 
class Vectorizer(IntEnum):
    """
    Chooses which function to call to vectorize audio

    Options:
        mels: Convert to a compressed Mel spectrogram
        mfccs: Convert to a MFCC spectrogram
        speechpy_mfccs: Legacy option to convert to MFCCs using old library
    """

    mels = 1
    mfccs = 2
    speechpy_mfccs = 3


@dataclass
class ListenerParams:
    """
    Parameters for the precise audio -> MFCC pipeline.

    Raw audio is chopped into overlapping windows (window_t / hop_t), each
    window is converted to n_fft frequency intensities, compressed to n_filt
    mel bands, and finally to n_mfcc MFCC coefficients.  The wake word must
    fit within buffer_t seconds of audio.

    Parameters:
    - buffer_t: Input size of audio (seconds); wakeword must fit within this time
    - window_t: Time (seconds) of the window used for one spectrogram frame
    - hop_t: Time (seconds) the window advances to the next spectrogram frame
    - sample_rate: Input audio sample rate (Hz)
    - sample_depth: Bytes per input audio sample
    - n_fft: Size of FFT generated from each audio frame
    - n_filt: Number of mel filters the FFT is compressed to
    - n_mfcc: Number of MFCC coefficients to use
    - use_delta: If True, concatenates "delta vectors" before the network
    - vectorizer: Type of input fed into the network (see Vectorizer)
    - threshold_config: Output distribution configuration
      (generated by precise-calc-threshold)
    - threshold_center: Output distribution center
      (generated by precise-calc-threshold)
    """

    buffer_t: float = 1.5
    window_t: float = 0.1
    hop_t: float = 0.05
    sample_rate: int = 16000
    sample_depth: int = 2
    n_fft: int = 512
    n_filt: int = 20
    n_mfcc: int = 13
    use_delta: bool = False
    vectorizer: int = Vectorizer.mfccs
    threshold_config: typing.Tuple[typing.Tuple[int, ...], ...] = ((6, 4),)
    threshold_center: float = 0.2

    @property
    def buffer_samples(self):
        """buffer_t converted to samples, truncating partial frames"""
        samples = int(self.sample_rate * self.buffer_t + 0.5)
        return self.hop_samples * (samples // self.hop_samples)

    @property
    def n_features(self):
        """Number of timesteps in one input to the network"""
        return 1 + int(
            floor((self.buffer_samples - self.window_samples) / self.hop_samples)
        )

    @property
    def window_samples(self):
        """window_t converted to samples"""
        return int(self.sample_rate * self.window_t + 0.5)

    @property
    def hop_samples(self):
        """hop_t converted to samples"""
        return int(self.sample_rate * self.hop_t + 0.5)

    @property
    def max_samples(self):
        """The input size converted to audio samples"""
        return int(self.buffer_t * self.sample_rate)

    @property
    def feature_size(self):
        """The size of an input vector generated with these parameters"""
        num_features = {
            Vectorizer.mfccs: self.n_mfcc,
            Vectorizer.mels: self.n_filt,
            Vectorizer.speechpy_mfccs: self.n_mfcc,
        }[self.vectorizer]
        if self.use_delta:
            num_features *= 2

        return num_features


def chunk_audio(
    audio: np.ndarray, chunk_size: int
) -> typing.Generator[np.ndarray, None, None]:
    """Yield successive full chunks of audio; a trailing partial chunk is dropped."""
    for i in range(chunk_size, len(audio), chunk_size):
        yield audio[i - chunk_size : i]


def buffer_to_audio(audio_buffer: bytes) -> np.ndarray:
    """Convert a raw mono audio byte string to numpy array of floats"""
    # NOTE(review): the dtype string below was mangled by extraction; "<i2"
    # (little-endian 16-bit signed PCM) is the standard precise/Mycroft value.
    return np.frombuffer(audio_buffer, dtype="<i2").astype(np.float32) / MAX_WAV_VALUE


def audio_to_buffer(audio: np.ndarray) -> bytes:
    """Convert a numpy array of floats to raw mono audio"""
    return (audio * MAX_WAV_VALUE).astype("<i2").tobytes()
def main() -> None:
    """Read raw 16Khz 16-bit mono PCM from stdin and print detected model names."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model",
        required=True,
        action="append",
        nargs="+",
        help="Snowboy model settings (path, [sensitivity], [audio_gain], [apply_frontend])",
    )
    parser.add_argument("--samples-per-chunk", type=int, default=1024)
    args = parser.parse_args()

    # logging.basicConfig wouldn't work if a handler already existed.
    # snowboy must mess with logging, so this resets it.
    logging.getLogger().handlers = []
    logging.basicConfig(level=logging.INFO)

    # Load model settings
    detectors: Dict[str, snowboydetect.SnowboyDetect] = {}
    for model_settings in args.model:
        model_path = Path(model_settings[0])

        # SetSensitivity takes an encoded string, not a float
        sensitivity = "0.5"
        if len(model_settings) > 1:
            sensitivity = model_settings[1]

        audio_gain = 1.0
        if len(model_settings) > 2:
            audio_gain = float(model_settings[2])

        apply_frontend = False
        if len(model_settings) > 3:
            apply_frontend = model_settings[3].strip().lower() == "true"

        detector = snowboydetect.SnowboyDetect(
            snowboydecoder.RESOURCE_FILE.encode(), str(model_path).encode()
        )
        detector.SetSensitivity(sensitivity.encode())
        detector.SetAudioGain(audio_gain)
        detector.ApplyFrontend(apply_frontend)

        detectors[model_path.stem] = detector

    # Read 16Khz, 16-bit mono PCM from stdin
    bytes_per_chunk = args.samples_per_chunk * 2  # 16-bit samples
    try:
        chunk = bytes()
        next_chunk = sys.stdin.buffer.read(bytes_per_chunk)
        while next_chunk:
            while len(chunk) >= bytes_per_chunk:
                for name, detector in detectors.items():
                    # Return is:
                    # -2 silence
                    # -1 error
                    #  0 voice
                    #  n index n-1
                    result_index = detector.RunDetection(chunk[:bytes_per_chunk])
                    if result_index > 0:
                        # Detection
                        print(name, flush=True)

                chunk = chunk[bytes_per_chunk:]

            # BUG FIX: previously read args.samples_per_chunk BYTES here, which
            # is only half a chunk of 16-bit audio (samples != bytes).  Read a
            # full bytes_per_chunk like the initial read above.
            next_chunk = sys.stdin.buffer.read(bytes_per_chunk)
            chunk += next_chunk
    except KeyboardInterrupt:
        pass
environment : "${venv:=${base_dir}/.venv}" # Python binary to use : "${PYTHON=python3}" python_version="$(${PYTHON} --version)" if [ ! -d "${venv}" ]; then # Create virtual environment echo "Creating virtual environment at ${venv} (${python_version})" rm -rf "${venv}" "${PYTHON}" -m venv "${venv}" source "${venv}/bin/activate" pip3 install --upgrade pip pip3 install --upgrade wheel setuptools else source "${venv}/bin/activate" fi # Install Python dependencies echo 'Installing Python dependencies' pip3 install -r "${base_dir}/requirements.txt" # ----------------------------------------------------------------------------- echo "OK" ================================================ FILE: pylintrc ================================================ [MESSAGES CONTROL] disable= format, abstract-method, cyclic-import, duplicate-code, global-statement, import-outside-toplevel, inconsistent-return-statements, locally-disabled, not-context-manager, too-few-public-methods, too-many-arguments, too-many-branches, too-many-instance-attributes, too-many-lines, too-many-locals, too-many-public-methods, too-many-return-statements, too-many-statements, too-many-boolean-expressions, unnecessary-pass, unused-argument, broad-except, too-many-nested-blocks, invalid-name, unused-import, fixme, useless-super-delegation, missing-module-docstring, missing-class-docstring, missing-function-docstring, import-error, consider-using-with [FORMAT] expected-line-ending-format=LF ================================================ FILE: requirements_dev.txt ================================================ black==22.12.0 flake8==6.0.0 isort==5.11.3 mypy==0.991 pylint==2.15.9 pytest==7.2.0 ================================================ FILE: requirements_http_api.txt ================================================ quart Quart-CORS hypercorn ================================================ FILE: rhasspy3/VERSION ================================================ 0.0.1 
================================================ FILE: rhasspy3/__init__.py ================================================ ================================================ FILE: rhasspy3/asr.py ================================================ """Speech to text.""" import asyncio import logging import wave from dataclasses import dataclass from typing import IO, AsyncIterable, Optional, Union from .audio import AudioChunk, AudioStart, AudioStop, wav_to_chunks from .config import PipelineProgramConfig from .core import Rhasspy from .event import Event, Eventable, async_read_event, async_write_event from .program import create_process from .vad import DOMAIN as VAD_DOMAIN from .vad import VoiceStarted, VoiceStopped DOMAIN = "asr" _TRANSCRIPT_TYPE = "transcript" _LOGGER = logging.getLogger(__name__) @dataclass class Transcript(Eventable): text: str @staticmethod def is_type(event_type: str) -> bool: return event_type == _TRANSCRIPT_TYPE def event(self) -> Event: return Event(type=_TRANSCRIPT_TYPE, data={"text": self.text}) @staticmethod def from_event(event: Event) -> "Transcript": assert event.data is not None return Transcript(text=event.data["text"]) async def transcribe( rhasspy: Rhasspy, program: Union[str, PipelineProgramConfig], wav_in: IO[bytes], samples_per_chunk: int, ) -> Optional[Transcript]: transcript: Optional[Transcript] = None wav_file: wave.Wave_read = wave.open(wav_in, "rb") with wav_file: rate = wav_file.getframerate() width = wav_file.getsampwidth() channels = wav_file.getnchannels() async with (await create_process(rhasspy, DOMAIN, program)) as asr_proc: assert asr_proc.stdin is not None assert asr_proc.stdout is not None timestamp = 0 await async_write_event( AudioStart(rate, width, channels, timestamp=timestamp).event(), asr_proc.stdin, ) is_first_chunk = True for chunk in wav_to_chunks(wav_file, samples_per_chunk=samples_per_chunk): if is_first_chunk: is_first_chunk = False _LOGGER.debug("transcribe: processing audio") await 
async_write_event(chunk.event(), asr_proc.stdin) if chunk.timestamp is not None: timestamp = chunk.timestamp else: timestamp += chunk.milliseconds await async_write_event( AudioStop(timestamp=timestamp).event(), asr_proc.stdin ) _LOGGER.debug("transcribe: audio finished") while True: event = await async_read_event(asr_proc.stdout) if event is None: break if Transcript.is_type(event.type): transcript = Transcript.from_event(event) _LOGGER.debug("transcribe: %s", transcript) break return transcript async def transcribe_stream( rhasspy: Rhasspy, asr_program: Union[str, PipelineProgramConfig], vad_program: Union[str, PipelineProgramConfig], audio_stream: AsyncIterable[bytes], rate: int, width: int, channels: int, ) -> Optional[Transcript]: transcript: Optional[Transcript] = None async with (await create_process(rhasspy, DOMAIN, asr_program)) as asr_proc, ( await create_process(rhasspy, VAD_DOMAIN, vad_program) ) as vad_proc: assert asr_proc.stdin is not None assert asr_proc.stdout is not None assert vad_proc.stdin is not None assert vad_proc.stdout is not None timestamp = 0 audio_start_event = AudioStart( rate, width, channels, timestamp=timestamp ).event() await asyncio.gather( async_write_event( audio_start_event, asr_proc.stdin, ), async_write_event( audio_start_event, vad_proc.stdin, ), ) async def next_chunk(): """Get the next chunk from audio stream.""" async for chunk_bytes in audio_stream: return chunk_bytes is_first_chunk = True audio_task = asyncio.create_task(next_chunk()) vad_task = asyncio.create_task(async_read_event(vad_proc.stdout)) pending = {audio_task, vad_task} while True: done, pending = await asyncio.wait( pending, return_when=asyncio.FIRST_COMPLETED ) if vad_task in done: vad_event = vad_task.result() if vad_event is None: break if VoiceStarted.is_type(vad_event.type): _LOGGER.debug("transcribe: voice started") elif VoiceStopped.is_type(vad_event.type): _LOGGER.debug("transcribe: voice stopped") break vad_task = 
# --- Fragment: tail of rhasspy3/asr.py transcribe(); the function begins
# before this chunk, so indentation below is reconstructed — verify against
# the full file. ---
                asyncio.create_task(async_read_event(vad_proc.stdout))
                pending.add(vad_task)

            if audio_task in done:
                chunk_bytes = audio_task.result()
                if not chunk_bytes:
                    # End of audio stream
                    break

                if is_first_chunk:
                    _LOGGER.debug("transcribe: processing audio")
                    is_first_chunk = False

                chunk = AudioChunk(rate, width, channels, chunk_bytes)
                chunk_event = chunk.event()

                # Fan the same chunk out to both the ASR and VAD processes.
                await asyncio.gather(
                    async_write_event(chunk_event, asr_proc.stdin),
                    async_write_event(chunk_event, vad_proc.stdin),
                )
                timestamp += chunk.milliseconds

                audio_task = asyncio.create_task(next_chunk())
                pending.add(audio_task)

        # Signal end of audio so the ASR program can finalize its transcript.
        await async_write_event(AudioStop(timestamp=timestamp).event(), asr_proc.stdin)
        _LOGGER.debug("transcribe: audio finished")

        # Wait for the transcript event (or EOF) from the ASR program.
        while True:
            event = await async_read_event(asr_proc.stdout)
            if event is None:
                break

            if Transcript.is_type(event.type):
                transcript = Transcript.from_event(event)
                _LOGGER.debug("transcribe: %s", transcript)
                break

    return transcript


================================================
FILE: rhasspy3/audio.py
================================================
"""Audio input/output."""
import audioop
import wave
from dataclasses import dataclass
from typing import Iterable, Optional

from .event import Event, Eventable

# Wire event type strings for the three audio events below.
_TYPE = "audio-chunk"
_START_TYPE = "audio-start"
_STOP_TYPE = "audio-stop"

DEFAULT_IN_RATE = 16000  # Hz
DEFAULT_OUT_RATE = 22050  # Hz

DEFAULT_IN_WIDTH = 2  # bytes
DEFAULT_OUT_WIDTH = 2  # bytes

DEFAULT_IN_CHANNELS = 1  # mono
DEFAULT_OUT_CHANNELS = 1  # mono

DEFAULT_SAMPLES_PER_CHUNK = 1024


@dataclass
class AudioChunk(Eventable):
    """Chunk of raw PCM audio."""

    rate: int
    """Hertz"""

    width: int
    """Bytes"""

    channels: int
    """Mono = 1"""

    audio: bytes
    """Raw audio"""

    timestamp: Optional[int] = None
    """Milliseconds"""

    @staticmethod
    def is_type(event_type: str) -> bool:
        return event_type == _TYPE

    def event(self) -> Event:
        # Audio bytes travel as the binary event payload, not as JSON data.
        return Event(
            type=_TYPE,
            data={
                "rate": self.rate,
                "width": self.width,
                "channels": self.channels,
                "timestamp": self.timestamp,
            },
            payload=self.audio,
        )

    @staticmethod
    def from_event(event: Event) -> "AudioChunk":
        assert event.data is not None
        assert event.payload is not None
        return AudioChunk(
            rate=event.data["rate"],
            width=event.data["width"],
            channels=event.data["channels"],
            audio=event.payload,
            timestamp=event.data.get("timestamp"),
        )

    @property
    def samples(self) -> int:
        # One sample occupies (width * channels) bytes of interleaved PCM.
        return len(self.audio) // (self.width * self.channels)

    @property
    def seconds(self) -> float:
        return self.samples / self.rate

    @property
    def milliseconds(self) -> int:
        return int(self.seconds * 1_000)


@dataclass
class AudioStart(Eventable):
    """Audio stream has started."""

    rate: int
    """Hertz"""

    width: int
    """Bytes"""

    channels: int
    """Mono = 1"""

    timestamp: Optional[int] = None
    """Milliseconds"""

    @staticmethod
    def is_type(event_type: str) -> bool:
        return event_type == _START_TYPE

    def event(self) -> Event:
        return Event(
            type=_START_TYPE,
            data={
                "rate": self.rate,
                "width": self.width,
                "channels": self.channels,
                "timestamp": self.timestamp,
            },
        )

    @staticmethod
    def from_event(event: Event) -> "AudioStart":
        assert event.data is not None
        return AudioStart(
            rate=event.data["rate"],
            width=event.data["width"],
            channels=event.data["channels"],
            timestamp=event.data.get("timestamp"),
        )


@dataclass
class AudioStop(Eventable):
    """Audio stream has stopped."""

    timestamp: Optional[int] = None
    """Milliseconds"""

    @staticmethod
    def is_type(event_type: str) -> bool:
        return event_type == _STOP_TYPE

    def event(self) -> Event:
        return Event(
            type=_STOP_TYPE,
            data={"timestamp": self.timestamp},
        )

    @staticmethod
    def from_event(event: Event) -> "AudioStop":
        return AudioStop(timestamp=event.data.get("timestamp"))


@dataclass
class AudioChunkConverter:
    """Converts audio chunks using audioop."""

    # Target format; a None field means "leave unchanged".
    rate: Optional[int] = None
    width: Optional[int] = None
    channels: Optional[int] = None

    # NOTE(review): no annotation, so this is a class attribute rather than a
    # dataclass field; convert() rebinds it per instance on first resample.
    _ratecv_state = None

    def convert(self, chunk: AudioChunk) -> AudioChunk:
        """Converts sample rate, width, and channels as necessary."""
        if (
            ((self.rate is None) or (chunk.rate == self.rate))
            and ((self.width is None) or (chunk.width == self.width))
            and ((self.channels is None) or (chunk.channels == self.channels))
        ):
            # Already in the target format; avoid copying.
            return chunk

        audio_bytes = chunk.audio
        width = chunk.width
        if (self.width is not None) and (chunk.width != self.width):
            # Convert sample width
            # NOTE(review): audioop is deprecated (removed in Python 3.13).
            audio_bytes = audioop.lin2lin(audio_bytes, chunk.width, self.width)
            width = self.width

        channels = chunk.channels
        if (self.channels is not None) and (chunk.channels != self.channels):
            # Convert to mono or stereo
            if self.channels == 1:
                audio_bytes = audioop.tomono(audio_bytes, width, 1.0, 1.0)
            elif self.channels == 2:
                audio_bytes = audioop.tostereo(audio_bytes, width, 1.0, 1.0)
            else:
                raise ValueError(f"Cannot convert to channels: {self.channels}")

            channels = self.channels

        rate = chunk.rate
        if (self.rate is not None) and (chunk.rate != self.rate):
            # Resample; ratecv state is carried across chunks for continuity.
            audio_bytes, self._ratecv_state = audioop.ratecv(
                audio_bytes,
                width,
                channels,
                chunk.rate,
                self.rate,
                self._ratecv_state,
            )
            rate = self.rate

        return AudioChunk(rate, width, channels, audio_bytes, timestamp=chunk.timestamp)


def wav_to_chunks(
    wav_file: wave.Wave_read, samples_per_chunk: int, timestamp: int = 0
) -> Iterable[AudioChunk]:
    """Splits WAV file into AudioChunks."""
    rate = wav_file.getframerate()
    width = wav_file.getsampwidth()
    channels = wav_file.getnchannels()
    audio_bytes = wav_file.readframes(samples_per_chunk)
    while audio_bytes:
        chunk = AudioChunk(
            rate=rate,
            width=width,
            channels=channels,
            audio=audio_bytes,
            timestamp=timestamp,
        )
        yield chunk
        # Advance the running timestamp by this chunk's duration.
        timestamp += chunk.milliseconds
        audio_bytes = wav_file.readframes(samples_per_chunk)


================================================
FILE: rhasspy3/config.py
================================================
import argparse
from dataclasses import dataclass, field
from typing import Any, Dict, Optional

from .util import merge_dict
from .util.dataclasses_json import DataClassJsonMixin
from .util.jaml import safe_load


@dataclass
class CommandConfig(DataClassJsonMixin):
    # Shell or exec command line for a program.
    command: str
    shell: bool = False


@dataclass
class
ProgramDownloadConfig(DataClassJsonMixin): description: Optional[str] = None check_file: Optional[str] = None @dataclass class ProgramInstallConfig(CommandConfig): check_file: Optional[str] = None download: Optional[CommandConfig] = None downloads: Optional[Dict[str, ProgramDownloadConfig]] = None @dataclass class ProgramConfig(CommandConfig): adapter: Optional[str] = None template_args: Optional[Dict[str, Any]] = None installed: bool = True install: Optional[ProgramInstallConfig] = None @dataclass class PipelineProgramConfig(DataClassJsonMixin): name: str template_args: Optional[Dict[str, Any]] = None after: Optional[CommandConfig] = None @dataclass class PipelineConfig(DataClassJsonMixin): inherit: Optional[str] = None mic: Optional[PipelineProgramConfig] = None wake: Optional[PipelineProgramConfig] = None vad: Optional[PipelineProgramConfig] = None asr: Optional[PipelineProgramConfig] = None intent: Optional[PipelineProgramConfig] = None handle: Optional[PipelineProgramConfig] = None tts: Optional[PipelineProgramConfig] = None snd: Optional[PipelineProgramConfig] = None @dataclass class SatelliteConfig(DataClassJsonMixin): mic: Optional[PipelineProgramConfig] = None wake: Optional[PipelineProgramConfig] = None remote: Optional[PipelineProgramConfig] = None snd: Optional[PipelineProgramConfig] = None @dataclass class ServerConfig(DataClassJsonMixin): command: str shell: bool = False template_args: Optional[Dict[str, Any]] = None @dataclass class Config(DataClassJsonMixin): programs: Dict[str, Dict[str, ProgramConfig]] """domain -> name -> program""" pipelines: Dict[str, PipelineConfig] = field(default_factory=dict) """name -> pipeline""" satellites: Dict[str, SatelliteConfig] = field(default_factory=dict) """name -> satellite""" servers: Dict[str, Dict[str, ServerConfig]] = field(default_factory=dict) """domain -> name -> server""" def __post_init__(self): # Handle inheritance # TODO: Catch loops pipeline_queue = list(self.pipelines.values()) while 
pipeline_queue: child_pipeline = pipeline_queue.pop() if child_pipeline.inherit: parent_pipeline = self.pipelines[child_pipeline.inherit] if parent_pipeline.inherit: # Need to process parent first pipeline_queue.append(child_pipeline) continue child_pipeline.mic = child_pipeline.mic or parent_pipeline.mic child_pipeline.wake = child_pipeline.wake or parent_pipeline.wake child_pipeline.vad = child_pipeline.vad or parent_pipeline.vad child_pipeline.asr = child_pipeline.asr or parent_pipeline.asr child_pipeline.intent = child_pipeline.intent or parent_pipeline.intent child_pipeline.handle = child_pipeline.handle or parent_pipeline.handle child_pipeline.tts = child_pipeline.tts or parent_pipeline.tts child_pipeline.snd = child_pipeline.snd or parent_pipeline.snd # Mark as done child_pipeline.inherit = None # ----------------------------------------------------------------------------- if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("config", nargs="+", help="Path to YAML configuration file") args = parser.parse_args() config_dict: Dict[str, Any] = {} for config_path in args.config: with open(config_path, "r", encoding="utf-8") as config_file: merge_dict(config_dict, safe_load(config_file)) config = Config.from_dict(config_dict) print(config) ================================================ FILE: rhasspy3/configuration.yaml ================================================ programs: # ----------- # Audio input # ----------- mic: # apt-get install alsa-utils arecord: command: | arecord -q -D "${device}" -r 16000 -c 1 -f S16_LE -t raw - adapter: | mic_adapter_raw.py --samples-per-chunk 1024 --rate 16000 --width 2 --channels 1 template_args: device: "default" # https://people.csail.mit.edu/hubert/pyaudio/docs/ pyaudio: command: | script/events # https://python-sounddevice.readthedocs.io sounddevice: command: | script/events # apt-get install gstreamer1.0-tools gstreamer1.0-plugins-base gstreamer_udp: command: | gst-launch-1.0 -v udpsrc 
address=${address} port=${port} ! rawaudioparse use-sink-caps=false format=pcm pcm-format=${format} sample-rate=${rate} num-channels=${channels} ! audioconvert ! audioresample ! volume volume=3.0 ! level ! fdsink fd=1 sync=false template_args: format: s16le rate: 16000 channels: 1 address: "0.0.0.0" port: 5000 adapter: | mic_adapter_raw.py --samples-per-chunk 1024 --rate 16000 --width 2 --channels 1 udp_raw: command: | bin/udp_raw.py --host ${host} --port ${port} template_args: host: 0.0.0.0 port: 5000 # ------------------- # Wake word detection # ------------------- wake: # https://github.com/Picovoice/porcupine # Models: see script/list_models porcupine1: command: | .venv/bin/python3 bin/porcupine_stream.py --model "${model}" template_args: model: "porcupine_linux.ppn" # https://github.com/Kitt-AI/snowboy # Models included in share/ # Custom wake word: https://github.com/rhasspy/snowboy-seasalt snowboy: command: | .venv/bin/python3 bin/snowboy_raw_text.py --model "${model}" adapter: | wake_adapter_raw.py template_args: model: "share/snowboy.umdl" # https://github.com/mycroftAI/mycroft-precise # Model included in share/ precise-lite: command: | .venv/bin/python3 bin/precise.py "${model}" adapter: | wake_adapter_raw.py template_args: model: "share/hey_mycroft.tflite" # TODO: snowman # https://github.com/Thalhammer/snowman/ # ------------------------ # Voice activity detection # ------------------------ vad: # https://github.com/snakers4/silero-vad # Model included in share/ silero: command: | script/speech_prob "${model}" adapter: | vad_adapter_raw.py --rate 16000 --width 2 --channels 1 --samples-per-chunk 512 template_args: model: "share/silero_vad.onnx" # https://pypi.org/project/webrtcvad/ webrtcvad: command: | script/speech_prob ${sensitivity} adapter: | vad_adapter_raw.py --rate 16000 --width 2 --channels 1 --samples-per-chunk 480 template_args: sensitivity: 3 # Uses rms energy threshold. # For testing only. 
energy: command: | bin/energy_speech_prob.py --threshold ${threshold} --width 2 --samples-per-chunk 1024 adapter: | vad_adapter_raw.py --rate 16000 --width 2 --channels 1 --samples-per-chunk 1024 template_args: threshold: 300 # -------------- # Speech to text # -------------- asr: # https://alphacephei.com/vosk/ # Models: https://alphacephei.com/vosk/models vosk: command: | script/raw2text "${model}" adapter: | asr_adapter_raw2text.py --rate 16000 --width 2 --channels 1 template_args: model: "${data_dir}/vosk-model-small-en-us-0.15" # Run server: asr vosk vosk.client: command: | client_unix_socket.py var/run/vosk.socket # https://stt.readthedocs.io # Models: https://coqui.ai/models/ coqui-stt: command: | script/raw2text "${model}" adapter: | asr_adapter_raw2text.py --rate 16000 --width 2 --channels 1 template_args: model: "${data_dir}/english_v1.0.0-large-vocab" # Run server: asr coqui-stt coqui-stt.client: command: | client_unix_socket.py var/run/coqui-stt.socket # https://github.com/cmusphinx/pocketsphinx # Models: https://github.com/synesthesiam/voice2json-profiles pocketsphinx: command: | script/raw2text "${model}" adapter: | asr_adapter_raw2text.py --rate 16000 --width 2 --channels 1 template_args: model: "${data_dir}/en-us_pocketsphinx-cmu" # Run server: asr pocketsphinx pocketsphinx.client: command: | client_unix_socket.py var/run/pocketsphinx.socket # https://github.com/openai/whisper # Models: tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large # Languages: af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et, # eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn, # ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps, # pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk, # ur,uz,vi,yi,yo,zh whisper: command: | script/wav2text --language ${language} --model-directory "${data_dir}" "${model}" "{wav_file}" adapter: | asr_adapter_wav2text.py template_args: 
language: "en" model: "tiny.en" # Run server: asr whisper whisper.client: command: | client_unix_socket.py var/run/whisper.socket # https://github.com/ggerganov/whisper.cpp/ # Models: https://huggingface.co/datasets/ggerganov/whisper.cpp whisper-cpp: command: | script/wav2text "${model}" "{wav_file}" adapter: | asr_adapter_wav2text.py template_args: model: "${data_dir}/ggml-tiny.en.bin" # Run server: asr whisper-cpp whisper-cpp.client: command: | client_unix_socket.py var/run/whisper-cpp.socket # https://github.com/guillaumekln/faster-whisper/ # Models: https://github.com/rhasspy/models/releases/tag/v1.0 # (asr_faster-whisper-*) faster-whisper: command: | script/wav2text --language ${language} "${model}" "{wav_file}" adapter: | asr_adapter_wav2text.py template_args: model: "${data_dir}/tiny-int8" language: "en" # Run server: asr faster-whisper faster-whisper.client: command: | client_unix_socket.py var/run/faster-whisper.socket # -------------- # Text to speech # -------------- tts: # https://github.com/rhasspy/piper/ # Models: https://github.com/rhasspy/piper/releases/tag/v0.0.2 piper: command: | bin/piper --model "${model}" --output_file - adapter: | tts_adapter_text2wav.py template_args: model: "${data_dir}/en-us-blizzard_lessac-medium.onnx" install: command: | script/setup.py --destination '${program_dir}/bin' check_file: "${program_dir}/bin/piper" download: command: | script/download.py --destination '${data_dir}' '${model}' downloads: en-us_lessac: description: "U.S. 
English voice" check_file: "${data_dir}/en-us-blizzard_lessac-medium.onnx" # Run server: tts piper piper.client: command: | client_unix_socket.py var/run/piper.socket # https://github.com/rhasspy/larynx/ # Models: https://rhasspy.github.io/larynx/ larynx: command: | .venv/bin/larynx --voices-dir "${data_dir}" --voice "${voice}" adapter: | tts_adapter_text2wav.py template_args: voice: "en-us" # Run server: tts larynx larynx.client: command: | bin/larynx_client.py ${url} ${voice} template_args: url: "http://localhost:5002/process" voice: "en-us" adapter: | tts_adapter_text2wav.py # https://github.com/espeak-ng/espeak-ng/ # apt-get install espeak-ng espeak-ng: command: | espeak-ng -v "${voice}" --stdin -w "{temp_file}" adapter: | tts_adapter_text2wav.py --temp_file template_args: voice: "en-us" # http://www.festvox.org/flite/ # Models: https://github.com/rhasspy/models/releases/tag/v1.0 # (tts_flite-*) flite: command: | flite -voice "${voice}" -o "{temp_file}" template_args: voice: "cmu_us_slt" adapter: | tts_adapter_text2wav.py --temp_file # http://www.cstr.ed.ac.uk/projects/festival/ # apt-get install festival festival- festival: command: | text2wave -o "{temp_file}" -eval "(voice_${voice})" template_args: voice: "cmu_us_slt_arctic_hts" adapter: | tts_adapter_text2wav.py --temp_file # https://tts.readthedocs.io # Models: see script/list_models coqui-tts: command: | .venv/bin/tts --model_name "${model}" --out_path "{temp_file}" --text "{text}" adapter: | tts_adapter_text2wav.py --temp_file --text template_args: model: "tts_models/en/ljspeech/vits" speaker_id: "" coqui-tts.client: command: | tts_adapter_http.py "${url}" --param speaker_id "${speaker_id}" template_args: url: "http://localhost:5002/api/tts" speaker_id: "" # http://mary.dfki.de/ # Models: https://github.com/synesthesiam/opentts/releases/tag/v2.1 # (marytts-voices.tar.gz) marytts: command: | bin/marytts.py "${url}" "${voice}" template_args: url: "http://localhost:59125/process" voice: "cmu-slt-hsmm" 
adapter: | tts_adapter_text2wav.py # https://github.com/mycroftAI/mimic3 # Models: https://mycroftai.github.io/mimic3-voices/ mimic3: command: | .venv/bin/mimic3 --voices-dir "${data_dir}" --voice "${voice}" --stdout adapter: | tts_adapter_text2wav.py template_args: voice: "apope" # Run server: tts mimic3 mimic3.client: command: | client_unix_socket.py var/run/mimic3.socket # ------------------ # Intent recognition # ------------------ intent: # Simple regex matching regex: command: | bin/regex.py -i TurnOn "turn on (the )?(?P.+)" # TODO: fsticuffs # https://github.com/rhasspy/rhasspy-nlu # TODO: hassil # https://github.com/home-assistant/hassil # TODO: rapidfuzz # https://github.com/rhasspy/rhasspy-fuzzywuzzy # TODO: snips-nlu # https://snips-nlu.readthedocs.io # --------------- # Intent Handling # --------------- handle: # Text only: repeats transcript back repeat: command: | cat shell: true adapter: | handle_adapter_text.py # Text only: send to HA Assist # https://www.home-assistant.io/docs/assist # # 1. Change server url # 2. Put long-lived access token in etc/token home_assistant: command: | bin/converse.py --language "${language}" "${url}" "${token_file}" adapter: | handle_adapter_text.py template_args: url: "http://localhost:8123/api/conversation/process" token_file: "${data_dir}/token" language: "" # Intent only: answer English date/time requests date_time: command: | bin/date_time.py adapter: | handle_adapter_text.py # Intent only: produces canned response to regex intent system test: command: | name="$(jq -r .slots.name)" echo "Turned on ${name}." 
shell: true adapter: | handle_adapter_json.py # ------------ # Audio output # ------------ snd: # apt-get install alsa-utils aplay: command: | aplay -q -D "${device}" -r 22050 -f S16_LE -c 1 -t raw adapter: | snd_adapter_raw.py --rate 22050 --width 2 --channels 1 template_args: device: "default" udp_raw: command: | bin/udp_raw.py --host "${host}" --port ${port} template_args: host: "127.0.0.1" port: 5001 # ------------------- # Remote base station # ------------------- remote: # Sample tool to communicate with websocket API. # Use rhasspy3/bin/satellite_run.py websocket: command: | script/run "${uri}" template_args: uri: "ws://localhost:13331/pipeline/asr-tts" # ----------------------------------------------------------------------------- servers: asr: vosk: command: | script/server "${model}" template_args: model: "${data_dir}/vosk-model-small-en-us-0.15" coqui-stt: command: | script/server "${model}" template_args: model: "${data_dir}/english_v1.0.0-large-vocab" pocketsphinx: command: | script/server "${model}" template_args: model: "${data_dir}/en-us_pocketsphinx-cmu" whisper: command: | script/server --model-directory "${data_dir}" --language ${language} --device ${device} ${model} template_args: language: "en" model: "tiny.en" device: "cpu" # or cuda whisper-cpp: command: | script/server "${model}" template_args: model: "${data_dir}/ggml-tiny.en.bin" faster-whisper: command: | script/server --language ${language} --device ${device} "${model}" template_args: language: "en" model: "${data_dir}/tiny-int8" device: "cpu" # or cuda tts: mimic3: command: | script/server --voice "${voice}" "${data_dir}" template_args: voice: "en_US/ljspeech_low" piper: command: | script/server "${model}" template_args: model: "${data_dir}/en-us-blizzard_lessac-medium.onnx" larynx: command: | script/server --voices-dir "${data_dir}" --host "${host}" template_args: host: "127.0.0.1" coqui-tts: command: | script/server # 
----------------------------------------------------------------------------- # Example satellites # satellites: # default: # mic: # name: arecord # wake: # name: porcupine1 # remote: # name: websocket # snd: # name: aplay # ----------------------------------------------------------------------------- # Example pipelines pipelines: # English (default) default: mic: name: arecord wake: name: porcupine1 vad: name: silero asr: name: faster-whisper # intent: # name: regex handle: name: repeat tts: name: piper snd: name: aplay # # German # de: # inherit: default # asr: # name: faster-whisper # template_args: # language: de # tts: # name: piper # template_args: # model: "${data_dir}/de-thorsten-low.onnx" # # French # fr: # inherit: default # asr: # name: faster-whisper # template_args: # language: fr # tts: # name: piper # template_args: # model: "${data_dir}/fr-siwis-low.onnx" # # Spanish # es: # inherit: default # asr: # name: faster-whisper # template_args: # language: es # tts: # name: piper # template_args: # model: "${data_dir}/es-carlfm-low.onnx" # # Italian # it: # inherit: default # asr: # name: faster-whisper # template_args: # language: it # tts: # name: piper # template_args: # model: "${data_dir}/it-riccardo_fasol-low.onnx" # # Catalan # ca: # inherit: default # asr: # name: faster-whisper # template_args: # language: ca # tts: # name: piper # template_args: # model: "${data_dir}/ca-upc_ona-low.onnx" # # Danish # da: # inherit: default # asr: # name: faster-whisper # template_args: # language: da # tts: # name: piper # template_args: # model: "${data_dir}/da-nst_talesyntese-medium.onnx" # # Dutch # nl: # inherit: default # asr: # name: faster-whisper # template_args: # language: nl # tts: # name: piper # template_args: # model: "${data_dir}/nl-nathalie-low.onnx" # # Norwegian # no: # inherit: default # asr: # name: faster-whisper # template_args: # language: no # tts: # name: piper # template_args: # model: "${data_dir}/no-talesyntese-medium.onnx" # # 
Ukrainian # uk: # inherit: default # asr: # name: faster-whisper # template_args: # language: uk # tts: # name: piper # template_args: # model: "${data_dir}/uk-lada-low.onnx" # # Vietnamese # vi: # inherit: default # asr: # name: faster-whisper # template_args: # language: vi # tts: # name: piper # template_args: # model: "${data_dir}/vi-vivos-low.onnx" # # Chinese # zh: # inherit: default # asr: # name: faster-whisper # template_args: # language: zh # tts: # name: piper # template_args: # model: "${data_dir}/zh-cn-huayan-low.onnx" ================================================ FILE: rhasspy3/core.py ================================================ import logging from dataclasses import dataclass from pathlib import Path from typing import Any, Dict, Union from .config import Config from .util import merge_dict from .util.jaml import safe_load _DIR = Path(__file__).parent _DEFAULT_CONFIG = _DIR / "configuration.yaml" _LOGGER = logging.getLogger(__name__) @dataclass class Rhasspy: config: Config config_dir: Path base_dir: Path config_dict: Dict[str, Any] @property def programs_dir(self) -> Path: """Directory where programs are installed.""" return self.config_dir / "programs" @property def data_dir(self) -> Path: """Directory where models are downloaded.""" return self.config_dir / "data" @staticmethod def load(config_dir: Union[str, Path]) -> "Rhasspy": """Load and merge configuration.yaml files from rhasspy3 and config dir.""" config_dir = Path(config_dir) config_paths = [ _DEFAULT_CONFIG, config_dir / "configuration.yaml", ] config_dict: Dict[str, Any] = {} for config_path in config_paths: if config_path.exists(): _LOGGER.debug("Loading config from %s", config_path) with config_path.open(encoding="utf-8") as config_file: merge_dict(config_dict, safe_load(config_file)) else: _LOGGER.debug("Skipping %s", config_path) return Rhasspy( config=Config.from_dict(config_dict), config_dir=config_dir, config_dict=config_dict, base_dir=_DIR.parent, ) 
================================================ FILE: rhasspy3/event.py ================================================ import asyncio import json import sys from abc import ABC, abstractmethod from dataclasses import dataclass, field from typing import IO, Any, Dict, Iterable, Optional _TYPE = "type" _DATA = "data" _PAYLOAD_LENGTH = "payload_length" _NEWLINE = "\n".encode() @dataclass class Event: type: str data: Dict[str, Any] = field(default_factory=dict) payload: Optional[bytes] = None def to_dict(self) -> Dict[str, Any]: return {_TYPE: self.type, _DATA: self.data} @staticmethod def from_dict(event_dict: Dict[str, Any]) -> "Event": return Event(type=event_dict["type"], data=event_dict.get("data", {})) class Eventable(ABC): @abstractmethod def event(self) -> Event: pass @staticmethod @abstractmethod def is_type(event_type: str) -> bool: pass def to_dict(self) -> Dict[str, Any]: return self.event().data async def async_read_event(reader: asyncio.StreamReader) -> Optional[Event]: try: json_line = await reader.readline() if not json_line: return None event_dict = json.loads(json_line) payload_length = event_dict.get(_PAYLOAD_LENGTH) payload: Optional[bytes] = None if payload_length is not None: payload = await reader.readexactly(payload_length) return Event( type=event_dict[_TYPE], data=event_dict.get(_DATA), payload=payload ) except KeyboardInterrupt: pass return None async def async_write_event(event: Event, writer: asyncio.StreamWriter): event_dict: Dict[str, Any] = event.to_dict() if event.payload: event_dict[_PAYLOAD_LENGTH] = len(event.payload) json_line = json.dumps(event_dict, ensure_ascii=False) try: writer.writelines((json_line.encode(), _NEWLINE)) if event.payload: writer.write(event.payload) await writer.drain() except KeyboardInterrupt: pass async def async_write_events(events: Iterable[Event], writer: asyncio.StreamWriter): coros = [] for event in events: event_dict: Dict[str, Any] = event.to_dict() if event.payload: event_dict[_PAYLOAD_LENGTH] = 
len(event.payload) json_line = json.dumps(event_dict, ensure_ascii=False) writer.writelines((json_line.encode(), _NEWLINE)) if event.payload: writer.write(event.payload) coros.append(writer.drain()) try: await asyncio.gather(*coros) except KeyboardInterrupt: pass def read_event(reader: Optional[IO[bytes]] = None) -> Optional[Event]: if reader is None: reader = sys.stdin.buffer try: json_line = reader.readline() if not json_line: return None event_dict = json.loads(json_line) payload_length = event_dict.get(_PAYLOAD_LENGTH) payload: Optional[bytes] = None if payload_length is not None: payload = reader.read(payload_length) return Event( type=event_dict[_TYPE], data=event_dict.get(_DATA), payload=payload ) except KeyboardInterrupt: pass return None def write_event(event: Event, writer: Optional[IO[bytes]] = None): if writer is None: writer = sys.stdout.buffer event_dict: Dict[str, Any] = event.to_dict() if event.payload: event_dict[_PAYLOAD_LENGTH] = len(event.payload) json_line = json.dumps(event_dict, ensure_ascii=False) try: writer.writelines((json_line.encode(), _NEWLINE)) if event.payload: writer.write(event.payload) writer.flush() except KeyboardInterrupt: pass ================================================ FILE: rhasspy3/handle.py ================================================ """Intent recognition and handling.""" import logging from dataclasses import dataclass from typing import Any, Dict, Optional, Union from .asr import Transcript from .config import PipelineProgramConfig from .core import Rhasspy from .event import Event, Eventable, async_read_event, async_write_event from .intent import Intent, NotRecognized from .program import create_process DOMAIN = "handle" _HANDLED_TYPE = "handled" _NOT_HANDLED_TYPE = "not-handled" _LOGGER = logging.getLogger(__name__) @dataclass class Handled(Eventable): text: Optional[str] = None @staticmethod def is_type(event_type: str) -> bool: return event_type == _HANDLED_TYPE def event(self) -> Event: data: Dict[str, 
Any] = {} if self.text is not None: data["text"] = self.text return Event(type=_HANDLED_TYPE, data=data) @staticmethod def from_event(event: Event) -> "Handled": assert event.data is not None return Handled(text=event.data.get("text")) @dataclass class NotHandled(Eventable): text: Optional[str] = None @staticmethod def is_type(event_type: str) -> bool: return event_type == _NOT_HANDLED_TYPE def event(self) -> Event: data: Dict[str, Any] = {} if self.text is not None: data["text"] = self.text return Event(type=_NOT_HANDLED_TYPE, data=data) @staticmethod def from_event(event: Event) -> "NotHandled": assert event.data is not None return NotHandled(text=event.data.get("text")) async def handle( rhasspy: Rhasspy, program: Union[str, PipelineProgramConfig], handle_input: Union[Intent, NotRecognized, Transcript], ) -> Optional[Union[Handled, NotHandled]]: handle_result: Optional[Union[Handled, NotHandled]] = None async with (await create_process(rhasspy, DOMAIN, program)) as handle_proc: assert handle_proc.stdin is not None assert handle_proc.stdout is not None _LOGGER.debug("handle: input=%s", handle_input) await async_write_event(handle_input.event(), handle_proc.stdin) while True: event = await async_read_event(handle_proc.stdout) if event is None: break if Handled.is_type(event.type): handle_result = Handled.from_event(event) elif NotHandled.is_type(event.type): handle_result = NotHandled.from_event(event) _LOGGER.debug("handle: %s", handle_result) return handle_result ================================================ FILE: rhasspy3/intent.py ================================================ """Intent recognition and handling.""" import logging from dataclasses import asdict, dataclass, field from typing import Any, Dict, List, Optional, Union from .config import PipelineProgramConfig from .core import Rhasspy from .event import Event, Eventable, async_read_event, async_write_event from .program import create_process DOMAIN = "intent" _RECOGNIZE_TYPE = "recognize" 
_INTENT_TYPE = "intent"
_NOT_RECOGNIZED_TYPE = "not-recognized"

_LOGGER = logging.getLogger(__name__)


@dataclass
class Entity:
    """Named slot value attached to a recognized intent."""

    name: str
    value: Optional[Any] = None


@dataclass
class Recognize(Eventable):
    """Request to recognize an intent from text."""

    text: str

    @staticmethod
    def is_type(event_type: str) -> bool:
        return event_type == _RECOGNIZE_TYPE

    def event(self) -> Event:
        data: Dict[str, Any] = {"text": self.text}
        return Event(type=_RECOGNIZE_TYPE, data=data)

    @staticmethod
    def from_event(event: Event) -> "Recognize":
        assert event.data is not None
        return Recognize(text=event.data["text"])


@dataclass
class Intent(Eventable):
    """Successfully recognized intent with optional entities."""

    name: str
    entities: List[Entity] = field(default_factory=list)

    @staticmethod
    def is_type(event_type: str) -> bool:
        return event_type == _INTENT_TYPE

    def event(self) -> Event:
        data: Dict[str, Any] = {"name": self.name}
        if self.entities:
            data["entities"] = [asdict(entity) for entity in self.entities]

        return Event(type=_INTENT_TYPE, data=data)

    @staticmethod
    def from_dict(data: Dict[str, Any]) -> "Intent":
        entity_dicts = data.get("entities")
        if entity_dicts:
            entities: List[Entity] = [
                Entity(**entity_dict) for entity_dict in entity_dicts
            ]
        else:
            entities = []

        return Intent(name=data["name"], entities=entities)

    @staticmethod
    def from_event(event: Event) -> "Intent":
        assert event.data is not None
        return Intent.from_dict(event.data)

    def to_rhasspy(self) -> Dict[str, Any]:
        # Shape expected by Rhasspy 2-style consumers (intent/entities/slots).
        return {
            "intent": {
                "name": self.name,
            },
            "entities": [
                {"entity": entity.name, "value": entity.value}
                for entity in self.entities
            ],
            "slots": {entity.name: entity.value for entity in self.entities},
        }


@dataclass
class NotRecognized(Eventable):
    """Intent recognition failed; text is an optional message."""

    text: Optional[str] = None

    @staticmethod
    def is_type(event_type: str) -> bool:
        return event_type == _NOT_RECOGNIZED_TYPE

    def event(self) -> Event:
        data: Dict[str, Any] = {}
        if self.text is not None:
            data["text"] = self.text

        return Event(type=_NOT_RECOGNIZED_TYPE, data=data)

    @staticmethod
    def from_event(event: Event) -> "NotRecognized":
        assert event.data is not None
        return NotRecognized(text=event.data.get("text"))


async def recognize(
    rhasspy: Rhasspy, program: Union[str, PipelineProgramConfig], text: str
) -> Optional[Union[Intent, NotRecognized]]:
    """Run an intent program on text.

    Returns the first intent or not-recognized event from the program, or
    None if the program produced neither.
    """
    result: Optional[Union[Intent, NotRecognized]] = None
    async with (await create_process(rhasspy, DOMAIN, program)) as intent_proc:
        assert intent_proc.stdin is not None
        assert intent_proc.stdout is not None

        _LOGGER.debug("recognize: text='%s'", text)

        await async_write_event(Recognize(text=text).event(), intent_proc.stdin)
        while True:
            intent_event = await async_read_event(intent_proc.stdout)
            if intent_event is None:
                break

            if Intent.is_type(intent_event.type):
                result = Intent.from_event(intent_event)
                break

            if NotRecognized.is_type(intent_event.type):
                result = NotRecognized.from_event(intent_event)
                break

    _LOGGER.debug("recognize: %s", result)

    return result


================================================
FILE: rhasspy3/mic.py
================================================
"""Audio input from a microphone."""

DOMAIN = "mic"


================================================
FILE: rhasspy3/pipeline.py
================================================
"""Full voice loop (pipeline)."""
import io
import logging
from collections import deque
from dataclasses import dataclass, fields
from enum import Enum
from typing import IO, Any, Deque, Dict, Optional, Union

from .asr import DOMAIN as ASR_DOMAIN
from .asr import Transcript, transcribe
from .config import CommandConfig, PipelineConfig, PipelineProgramConfig
from .core import Rhasspy
from .event import Event, Eventable, async_read_event
from .handle import Handled, NotHandled, handle
from .intent import Intent, NotRecognized, recognize
from .mic import DOMAIN as MIC_DOMAIN
from .program import create_process, run_command
from .snd import play
from .tts import synthesize
from .util.dataclasses_json import DataClassJsonMixin
from .vad import segment
from .wake import Detection, detect

_LOGGER = logging.getLogger(__name__)


@dataclass
class
PipelineResult(DataClassJsonMixin): """Result of running all or part of a pipeline.""" wake_detection: Optional[Detection] = None asr_transcript: Optional[Transcript] = None intent_result: Optional[Union[Intent, NotRecognized]] = None handle_result: Optional[Union[Handled, NotHandled]] = None def to_event_dict(self) -> Dict[str, Any]: event_dict: Dict[str, Any] = {} for field in fields(self): value = getattr(self, field.name) if value is None: event_dict[field.name] = {} else: assert isinstance(value, Eventable) event_dict[field.name] = value.event().to_dict() return event_dict class StopAfterDomain(str, Enum): WAKE = "wake" ASR = "asr" INTENT = "intent" HANDLE = "handle" TTS = "tts" async def run( rhasspy: Rhasspy, pipeline: Union[str, PipelineConfig], samples_per_chunk: int, asr_chunks_to_buffer: int = 0, mic_program: Optional[Union[str, PipelineProgramConfig]] = None, wake_program: Optional[Union[str, PipelineProgramConfig]] = None, wake_detection: Optional[Detection] = None, asr_program: Optional[Union[str, PipelineProgramConfig]] = None, asr_wav_in: Optional[IO[bytes]] = None, asr_transcript: Optional[Transcript] = None, vad_program: Optional[Union[str, PipelineProgramConfig]] = None, intent_result: Optional[Union[Intent, NotRecognized]] = None, intent_program: Optional[Union[str, PipelineProgramConfig]] = None, handle_result: Optional[Union[Handled, NotHandled]] = None, handle_program: Optional[Union[str, PipelineProgramConfig]] = None, tts_wav_in: Optional[IO[bytes]] = None, tts_program: Optional[Union[str, PipelineProgramConfig]] = None, snd_program: Optional[Union[str, PipelineProgramConfig]] = None, stop_after: Optional[StopAfterDomain] = None, ) -> PipelineResult: """Run a full or partial pipeline.""" pipeline_result = PipelineResult() if isinstance(pipeline, str): pipeline = rhasspy.config.pipelines[pipeline] mic_program = mic_program or pipeline.mic wake_program = wake_program or pipeline.wake wake_after = pipeline.wake.after if pipeline.wake else None 
asr_program = asr_program or pipeline.asr asr_after = pipeline.asr.after if pipeline.asr else None vad_program = vad_program or pipeline.vad intent_program = intent_program or pipeline.intent handle_program = handle_program or pipeline.handle tts_program = tts_program or pipeline.tts snd_program = snd_program or pipeline.snd skip_asr = ( (intent_result is not None) or (handle_result is not None) or (tts_wav_in is not None) ) if not skip_asr: # Speech to text if asr_wav_in is not None: # WAV input if stop_after == StopAfterDomain.WAKE: return pipeline_result asr_wav_in.seek(0) assert asr_program is not None, "No asr program" asr_transcript = await transcribe( rhasspy, asr_program, asr_wav_in, samples_per_chunk ) if asr_after is not None: await run_command(rhasspy, asr_after) elif asr_transcript is None: # Mic input assert mic_program is not None, "No asr program" if wake_program is None: # No wake assert asr_program is not None, "No asr program" assert vad_program is not None, "No vad program" await _mic_asr( rhasspy, mic_program, asr_program, vad_program, pipeline_result ) elif stop_after == StopAfterDomain.WAKE: # Audio input, wake word detection, segmentation, speech to text assert wake_program is not None, "No vad program" await _mic_wake( rhasspy, mic_program, wake_program, pipeline_result, wake_detection=wake_detection, ) return pipeline_result else: assert wake_program is not None, "No vad program" assert asr_program is not None, "No asr program" assert vad_program is not None, "No vad program" await _mic_wake_asr( rhasspy, mic_program, wake_program, asr_program, vad_program, pipeline_result, asr_chunks_to_buffer=asr_chunks_to_buffer, wake_detection=wake_detection, wake_after=wake_after, ) if asr_after is not None: await run_command(rhasspy, asr_after) asr_transcript = pipeline_result.asr_transcript pipeline_result.asr_transcript = asr_transcript if (stop_after == StopAfterDomain.ASR) or ( (intent_program is None) and (handle_program is None) ): return 
pipeline_result # Text to intent if (asr_transcript is not None) and (intent_program is not None): pipeline_result.asr_transcript = asr_transcript intent_result = await recognize( rhasspy, intent_program, asr_transcript.text or "" ) pipeline_result.intent_result = intent_result # Handle intent handle_input: Optional[Union[Intent, NotRecognized, Transcript]] = None if intent_result is not None: pipeline_result.intent_result = intent_result handle_input = intent_result elif asr_transcript is not None: handle_input = asr_transcript if (stop_after == StopAfterDomain.INTENT) or (handle_program is None): return pipeline_result if (handle_input is not None) and (handle_result is None): assert handle_program is not None, "Pipeline is missing handle" handle_result = await handle(rhasspy, handle_program, handle_input) pipeline_result.handle_result = handle_result if (stop_after == StopAfterDomain.HANDLE) or (tts_program is None): return pipeline_result # Text to speech if handle_result is not None: pipeline_result.handle_result = handle_result if handle_result.text: assert tts_program is not None, "Pipeline is missing tts" tts_wav_in = io.BytesIO() await synthesize(rhasspy, tts_program, handle_result.text, tts_wav_in) else: _LOGGER.debug("No text returned from handle") if (stop_after == StopAfterDomain.TTS) or (snd_program is None): return pipeline_result # Audio output if tts_wav_in is not None: tts_wav_in.seek(0) assert snd_program is not None, "Pipeline is missing snd" await play(rhasspy, snd_program, tts_wav_in, samples_per_chunk) return pipeline_result async def _mic_wake( rhasspy: Rhasspy, mic_program: Union[str, PipelineProgramConfig], wake_program: Union[str, PipelineProgramConfig], pipeline_result: PipelineResult, wake_detection: Optional[Detection] = None, ): """Just wake word detection.""" async with (await create_process(rhasspy, MIC_DOMAIN, mic_program)) as mic_proc: assert mic_proc.stdout is not None if wake_detection is None: wake_detection = await detect( 
rhasspy, wake_program, mic_proc.stdout, ) if wake_detection is not None: pipeline_result.wake_detection = wake_detection else: _LOGGER.debug("run: no wake word detected") async def _mic_asr( rhasspy: Rhasspy, mic_program: Union[str, PipelineProgramConfig], asr_program: Union[str, PipelineProgramConfig], vad_program: Union[str, PipelineProgramConfig], pipeline_result: PipelineResult, asr_chunks_to_buffer: int = 0, ): """Just asr transcription (+ silence detection).""" async with (await create_process(rhasspy, MIC_DOMAIN, mic_program)) as mic_proc, ( await create_process(rhasspy, ASR_DOMAIN, asr_program) ) as asr_proc: assert mic_proc.stdout is not None assert asr_proc.stdin is not None assert asr_proc.stdout is not None await segment( rhasspy, vad_program, mic_proc.stdout, asr_proc.stdin, ) while True: asr_event = await async_read_event(asr_proc.stdout) if asr_event is None: break if Transcript.is_type(asr_event.type): pipeline_result.asr_transcript = Transcript.from_event(asr_event) break async def _mic_wake_asr( rhasspy: Rhasspy, mic_program: Union[str, PipelineProgramConfig], wake_program: Union[str, PipelineProgramConfig], asr_program: Union[str, PipelineProgramConfig], vad_program: Union[str, PipelineProgramConfig], pipeline_result: PipelineResult, asr_chunks_to_buffer: int = 0, wake_detection: Optional[Detection] = None, wake_after: Optional[CommandConfig] = None, ): """Wake word detect + asr transcription (+ silence detection).""" chunk_buffer: Optional[Deque[Event]] = ( deque(maxlen=asr_chunks_to_buffer) if asr_chunks_to_buffer > 0 else None ) async with (await create_process(rhasspy, MIC_DOMAIN, mic_program)) as mic_proc, ( await create_process(rhasspy, ASR_DOMAIN, asr_program) ) as asr_proc: assert mic_proc.stdout is not None assert asr_proc.stdin is not None assert asr_proc.stdout is not None if wake_detection is None: wake_detection = await detect( rhasspy, wake_program, mic_proc.stdout, chunk_buffer ) if wake_detection is not None: if wake_after is not 
None: await run_command(rhasspy, wake_after) pipeline_result.wake_detection = wake_detection await segment( rhasspy, vad_program, mic_proc.stdout, asr_proc.stdin, chunk_buffer, ) while True: asr_event = await async_read_event(asr_proc.stdout) if asr_event is None: break if Transcript.is_type(asr_event.type): pipeline_result.asr_transcript = Transcript.from_event(asr_event) break else: _LOGGER.debug("run: no wake word detected") ================================================ FILE: rhasspy3/program.py ================================================ """Utilities for creating processes.""" import asyncio import logging import os import shlex import string from asyncio.subprocess import PIPE, Process from typing import Optional, Union from .config import CommandConfig, PipelineProgramConfig, ProgramConfig from .core import Rhasspy from .util import merge_dict _LOGGER = logging.getLogger(__name__) class MissingProgramConfigError(Exception): pass class ProcessContextManager: """Wrapper for an async process that terminates on exit.""" def __init__(self, proc: Process, name: str): self.proc = proc self.name = name async def __aenter__(self): return self.proc async def __aexit__(self, exc_type, exc, tb): try: if self.proc.returncode is None: self.proc.terminate() await self.proc.wait() except ProcessLookupError: # Expected when process has already exited pass except Exception: _LOGGER.exception("Unexpected error stopping process: %s", self.name) async def create_process( rhasspy: Rhasspy, domain: str, name: Union[str, PipelineProgramConfig] ) -> ProcessContextManager: pipeline_config: Optional[PipelineProgramConfig] = None if isinstance(name, PipelineProgramConfig): pipeline_config = name name = pipeline_config.name assert name, f"No program name for domain {domain}" # The "." is special in program names: # it means to use the directory of "base" in .. 
# # This is used for .client programs, which are just scripts in the # "base" directory that communicate with their respective servers. if "." in name: base_name = name.split(".", maxsplit=1)[0] else: base_name = name program_config: Optional[ProgramConfig] = rhasspy.config.programs.get( domain, {} ).get(name) assert program_config is not None, f"No config for program {domain}/{name}" assert isinstance(program_config, ProgramConfig) # Directory where this program is installed program_dir = rhasspy.programs_dir / domain / base_name # Directory where this program should store data data_dir = rhasspy.data_dir / domain / base_name # ${variables} available within program/pipeline template_args default_mapping = { "program_dir": str(program_dir.absolute()), "data_dir": str(data_dir.absolute()), } command_str = program_config.command.strip() command_mapping = dict(default_mapping) if program_config.template_args: # Substitute within program template args args_mapping = dict(program_config.template_args) for arg_name, arg_str in args_mapping.items(): if not isinstance(arg_str, str): continue arg_template = string.Template(arg_str) args_mapping[arg_name] = arg_template.safe_substitute(default_mapping) command_mapping.update(args_mapping) if pipeline_config is not None: if pipeline_config.template_args: # Substitute within pipeline template args args_mapping = dict(pipeline_config.template_args) for arg_name, arg_str in args_mapping.items(): if not isinstance(arg_str, str): continue arg_template = string.Template(arg_str) args_mapping[arg_name] = arg_template.safe_substitute(default_mapping) merge_dict(command_mapping, args_mapping) # Substitute template args command_template = string.Template(command_str) command_str = command_template.safe_substitute(command_mapping) working_dir = rhasspy.programs_dir / domain / base_name env = dict(os.environ) # Add rhasspy3/bin to $PATH env["PATH"] = f'{rhasspy.base_dir}/bin:${env["PATH"]}' # Ensure stdout is flushed for Python programs 
env["PYTHONUNBUFFERED"] = "1" cwd = working_dir if working_dir.is_dir() else None if program_config.shell: if program_config.adapter: program, *args = shlex.split(program_config.adapter) args.append("--shell") args.append(command_str) _LOGGER.debug("(shell): %s %s", program, args) proc = await asyncio.create_subprocess_exec( program, *args, stdin=PIPE, stdout=PIPE, cwd=cwd, env=env, ) else: _LOGGER.debug("(shell): %s %s", program, args) proc = await asyncio.create_subprocess_shell( command_str, stdin=PIPE, stdout=PIPE, cwd=cwd, env=env, ) else: if program_config.adapter: program, *args = shlex.split(program_config.adapter) args.append(command_str) else: program, *args = shlex.split(command_str) _LOGGER.debug("%s %s", program, args) proc = await asyncio.create_subprocess_exec( program, *args, stdin=PIPE, stdout=PIPE, cwd=cwd, env=env, ) return ProcessContextManager(proc, name=name) async def run_command(rhasspy: Rhasspy, command_config: CommandConfig) -> int: env = dict(os.environ) # Add rhasspy3/bin to $PATH env["PATH"] = f'{rhasspy.base_dir}/bin:${env["PATH"]}' # Ensure stdout is flushed for Python programs env["PYTHONUNBUFFERED"] = "1" if command_config.shell: proc = await asyncio.create_subprocess_shell( command_config.command, env=env, ) else: program, *args = shlex.split(command_config.command) proc = await asyncio.create_subprocess_exec( program, *args, env=env, ) await proc.wait() assert proc.returncode is not None return proc.returncode ================================================ FILE: rhasspy3/py.typed ================================================ ================================================ FILE: rhasspy3/remote.py ================================================ """Remote communication with a base station.""" DOMAIN = "remote" ================================================ FILE: rhasspy3/snd.py ================================================ """Audio output to speakers.""" import wave from dataclasses import dataclass from typing import 
IO, AsyncIterable, Optional, Union from .audio import AudioChunk, AudioStop, wav_to_chunks from .config import PipelineProgramConfig from .core import Rhasspy from .event import Event, Eventable, async_read_event, async_write_event from .program import create_process DOMAIN = "snd" _PLAYED_TYPE = "played" @dataclass class Played(Eventable): @staticmethod def is_type(event_type: str) -> bool: return event_type == _PLAYED_TYPE def event(self) -> Event: return Event(type=_PLAYED_TYPE) @staticmethod def from_event(event: Event) -> "Played": return Played() async def play( rhasspy: Rhasspy, program: Union[str, PipelineProgramConfig], wav_in: IO[bytes], samples_per_chunk: int, ) -> Optional[Played]: wav_file: wave.Wave_read = wave.open(wav_in, "rb") with wav_file: async with (await create_process(rhasspy, DOMAIN, program)) as snd_proc: assert snd_proc.stdin is not None assert snd_proc.stdout is not None timestamp: Optional[int] = None for chunk in wav_to_chunks(wav_file, samples_per_chunk=samples_per_chunk): await async_write_event(chunk.event(), snd_proc.stdin) timestamp = chunk.timestamp await async_write_event( AudioStop(timestamp=timestamp).event(), snd_proc.stdin ) # Wait for confimation while True: event = await async_read_event(snd_proc.stdout) if event is None: break if Played.is_type(event.type): return Played.from_event(event) return None async def play_stream( rhasspy: Rhasspy, program: Union[str, PipelineProgramConfig], audio_stream: AsyncIterable[bytes], rate: int, width: int, channels: int, ) -> Optional[Played]: async with (await create_process(rhasspy, DOMAIN, program)) as snd_proc: assert snd_proc.stdin is not None assert snd_proc.stdout is not None async for audio_bytes in audio_stream: chunk = AudioChunk(rate, width, channels, audio_bytes) await async_write_event(chunk.event(), snd_proc.stdin) await async_write_event(AudioStop().event(), snd_proc.stdin) # Wait for confimation while True: event = await async_read_event(snd_proc.stdout) if event is None: 
break if Played.is_type(event.type): return Played.from_event(event) return None ================================================ FILE: rhasspy3/tts.py ================================================ """Text to speech.""" import wave from dataclasses import dataclass from typing import IO, AsyncIterable, Union from .audio import AudioChunk, AudioStart, AudioStop from .config import PipelineProgramConfig from .core import Rhasspy from .event import Event, Eventable, async_read_event, async_write_event from .program import create_process DOMAIN = "tts" _SYNTHESIZE_TYPE = "synthesize" @dataclass class Synthesize(Eventable): """Request to synthesize audio from text.""" text: str """Text to synthesize.""" @staticmethod def is_type(event_type: str) -> bool: return event_type == _SYNTHESIZE_TYPE def event(self) -> Event: return Event(type=_SYNTHESIZE_TYPE, data={"text": self.text}) @staticmethod def from_event(event: Event) -> "Synthesize": assert event.data is not None return Synthesize(text=event.data["text"]) async def synthesize( rhasspy: Rhasspy, program: Union[str, PipelineProgramConfig], text: str, wav_out: IO[bytes], ): """Synthesize audio from text to WAV output.""" async with (await create_process(rhasspy, DOMAIN, program)) as tts_proc: assert tts_proc.stdin is not None assert tts_proc.stdout is not None await async_write_event(Synthesize(text=text).event(), tts_proc.stdin) wav_file: wave.Wave_write = wave.open(wav_out, "wb") wav_params_set = False with wav_file: while True: event = await async_read_event(tts_proc.stdout) if event is None: break if AudioStart.is_type(event.type): if not wav_params_set: start = AudioStart.from_event(event) wav_file.setframerate(start.rate) wav_file.setsampwidth(start.width) wav_file.setnchannels(start.channels) wav_params_set = True elif AudioChunk.is_type(event.type): chunk = AudioChunk.from_event(event) if not wav_params_set: wav_file.setframerate(chunk.rate) wav_file.setsampwidth(chunk.width) 
wav_file.setnchannels(chunk.channels) wav_params_set = True wav_file.writeframes(chunk.audio) elif AudioStop.is_type(event.type): break async def synthesize_stream( rhasspy: Rhasspy, program: Union[str, PipelineProgramConfig], text: str, ) -> AsyncIterable[AudioChunk]: """Synthesize audio from text to a raw stream.""" async with (await create_process(rhasspy, DOMAIN, program)) as tts_proc: assert tts_proc.stdin is not None assert tts_proc.stdout is not None await async_write_event(Synthesize(text=text).event(), tts_proc.stdin) while True: event = await async_read_event(tts_proc.stdout) if event is None: break if AudioChunk.is_type(event.type): yield AudioChunk.from_event(event) elif AudioStop.is_type(event.type): break ================================================ FILE: rhasspy3/util/__init__.py ================================================ import collections def merge_dict(base_dict, new_dict): """Merges new_dict into base_dict.""" for key, value in new_dict.items(): if key in base_dict: old_value = base_dict[key] if isinstance(old_value, collections.abc.MutableMapping): # Combine dictionary assert isinstance( value, collections.abc.Mapping ), f"Not a dict: {value}" merge_dict(old_value, value) elif isinstance(old_value, collections.abc.MutableSequence): # Combine list assert isinstance( value, collections.abc.Sequence ), f"Not a list: {value}" old_value.extend(value) else: # Overwrite base_dict[key] = value else: base_dict[key] = value ================================================ FILE: rhasspy3/util/dataclasses_json.py ================================================ """Implement a tiny subset of dataclasses_json for config.""" from collections.abc import Mapping, Sequence from dataclasses import asdict, fields, is_dataclass from typing import Any, Dict, Type class DataClassJsonMixin: """Adds from_dict to dataclass.""" @classmethod def from_dict(cls, data: Dict[str, Any]) -> Any: """Parse dataclasses recursively.""" kwargs: Dict[str, Any] = {} 
cls_fields = {field.name: field for field in fields(cls)} for key, value in data.items(): field = cls_fields[key] if is_dataclass(field.type): assert issubclass(field.type, DataClassJsonMixin), field.type kwargs[key] = field.type.from_dict(value) else: kwargs[key] = _decode(value, field.type) return cls(**kwargs) def to_dict(self) -> Dict[str, Any]: """Alias for asdict.""" return asdict(self) def _decode(value: Any, target_type: Type) -> Any: """Decode value using (possibly generic) type.""" if is_dataclass(target_type): assert issubclass(target_type, DataClassJsonMixin), target_type return target_type.from_dict(value) if value is not None else None if hasattr(target_type, "__args__"): # Optional[T] if type(None) in target_type.__args__: optional_type = target_type.__args__[0] return _decode(value, optional_type) # List[T] if isinstance(value, Sequence): list_type = target_type.__args__[0] return [_decode(item, list_type) for item in value] # Dict[str, T] if isinstance(value, Mapping): value_type = target_type.__args__[1] return { map_key: _decode(map_value, value_type) for map_key, map_value in value.items() } return value ================================================ FILE: rhasspy3/util/jaml.py ================================================ """JAML is JSON objects as a *severely* restricted subset of YAML.""" from collections.abc import Mapping from enum import Enum, auto from typing import IO, Any, Dict, List, Union _INDENT = 2 def safe_load(fp: IO[str]) -> Dict[str, Any]: loader = JamlLoader() for line in fp: loader.process_line(line) return loader.output class LoaderState(Enum): IN_DICT = auto() LITERAL = auto() class JamlLoader: def __init__(self) -> None: self.output: Dict[str, Any] = {} self.indent = 0 self.state = LoaderState.IN_DICT self.literal = "" self.target_stack: List[Union[Dict[str, Any], str]] = [self.output] def process_line(self, line: str): line_stripped = line.strip() if line_stripped.startswith("#") or (not line_stripped): # Comment or 
empty line return line_indent = len(line) - len(line.lstrip()) if self.state == LoaderState.LITERAL: # Multi-line literal if line_indent < self.indent: # Done with literal assert len(self.target_stack) > 1 key = self.target_stack.pop() assert isinstance(key, str) target = self.target_stack[-1] assert isinstance(target, Mapping) target[key] = self.literal.strip() # Reset indent and state self.indent -= _INDENT self.state = LoaderState.IN_DICT else: # Add to literal self.literal += "\n" + line.strip() if self.state == LoaderState.IN_DICT: self._add_key(line, line_indent) def _add_key(self, line, line_indent: int): while line_indent < self.indent: self.target_stack.pop() self.indent -= _INDENT assert self.target_stack target = self.target_stack[-1] assert isinstance(target, Mapping) parts = line.split(":", maxsplit=1) assert len(parts) == 2 key = parts[0].strip() value = parts[1].strip() assert not key.startswith("-"), "Lists are not supported" # Remove inline comments if value and (value[0] in ("'", '"')): # Just keep what's in quotes. # This doesn't take escapes, etc. into account. 
end_quote = value.find(value[0], 1) value = value[: end_quote + 1] else: # Remove comment value = value.split("#", maxsplit=1)[0] value_is_dict = True if value: value_is_dict = False if value[0] in ("'", '"'): # Remove quotes value = value[1:-1] elif value == "|": self.literal = "" self.target_stack.append(key) self.indent += _INDENT self.state = LoaderState.LITERAL return elif value.lower() in ("true", "false"): value = value.lower() == "true" else: try: value = int(value) except ValueError: try: value = float(value) except ValueError: pass if value_is_dict: new_target: Dict[str, Any] = {} target[key] = new_target self.target_stack.append(new_target) self.indent += _INDENT else: target[key] = value ================================================ FILE: rhasspy3/vad.py ================================================ """Voice activity detection.""" import asyncio import logging import time from dataclasses import dataclass from typing import Iterable, Optional, Union from .audio import AudioChunk, AudioStop from .config import PipelineProgramConfig from .core import Rhasspy from .event import Event, Eventable, async_read_event, async_write_event from .program import create_process DOMAIN = "vad" _STARTED_TYPE = "voice-started" _STOPPED_TYPE = "voice-stopped" _LOGGER = logging.getLogger(__name__) @dataclass class VoiceStarted(Eventable): """User has started speaking.""" timestamp: Optional[int] = None """Milliseconds""" @staticmethod def is_type(event_type: str) -> bool: return event_type == _STARTED_TYPE def event(self) -> Event: return Event( type=_STARTED_TYPE, data={"timestamp": self.timestamp}, ) @staticmethod def from_event(event: Event) -> "VoiceStarted": return VoiceStarted(timestamp=event.data.get("timestamp")) @dataclass class VoiceStopped(Eventable): """User has stopped speaking.""" timestamp: Optional[int] = None """Milliseconds""" @staticmethod def is_type(event_type: str) -> bool: return event_type == _STOPPED_TYPE def event(self) -> Event: return 
Event( type=_STOPPED_TYPE, data={"timestamp": self.timestamp}, ) @staticmethod def from_event(event: Event) -> "VoiceStopped": return VoiceStopped(timestamp=event.data.get("timestamp")) @dataclass class Segmenter: """Segments an audio stream by speech.""" speech_seconds: float """Seconds of speech before voice command has started.""" silence_seconds: float """Seconds of silence after voice command has ended.""" timeout_seconds: float """Maximum number of seconds before stopping with timeout=True.""" reset_seconds: float """Seconds before reset start/stop time counters.""" started: bool = False """True if user has started speaking""" start_timestamp: Optional[int] = None """Timestamp when user started speaking.""" stopped: bool = False """True if user has stopped speaking""" stop_timestamp: Optional[int] = None """Timestamp when user stopped speaking.""" timeout: bool = False """True if stopping was due to timeout.""" _in_command: bool = False """True if inside voice command.""" _speech_seconds_left: float = 0.0 """Seconds left before considering voice command as started.""" _silence_seconds_left: float = 0.0 """Seconds left before considering voice command as stopped.""" _timeout_seconds_left: float = 0.0 """Seconds left before considering voice command timed out.""" _reset_seconds_left: float = 0.0 """Seconds left before resetting start/stop time counters.""" def __post_init__(self): self.reset() def reset(self): """Resets all counters and state.""" self._speech_seconds_left = self.speech_seconds self._silence_seconds_left = self.silence_seconds self._timeout_seconds_left = self.timeout_seconds self._reset_seconds_left = self.reset_seconds self._in_command = False self.start_timestamp = None self.stop_timestamp = None def process( self, chunk: bytes, chunk_seconds: float, is_speech: bool, timestamp: int ): """Process a single chunk of audio.""" self._timeout_seconds_left -= chunk_seconds if self._timeout_seconds_left <= 0: self.stop_timestamp = timestamp 
self.timeout = True self.stopped = True return if not self._in_command: if is_speech: self._reset_seconds_left = self.reset_seconds if self.start_timestamp is None: self.start_timestamp = timestamp self._speech_seconds_left -= chunk_seconds if self._speech_seconds_left <= 0: # Inside voice command self._in_command = True self.started = True else: # Reset if enough silence self._reset_seconds_left -= chunk_seconds if self._reset_seconds_left <= 0: self._speech_seconds_left = self.speech_seconds self.start_timestamp = None else: if not is_speech: self._reset_seconds_left = self.reset_seconds self._silence_seconds_left -= chunk_seconds if self._silence_seconds_left <= 0: self.stop_timestamp = timestamp self.stopped = True else: # Reset if enough speech self._reset_seconds_left -= chunk_seconds if self._reset_seconds_left <= 0: self._silence_seconds_left = self.silence_seconds async def segment( rhasspy: Rhasspy, program: Union[str, PipelineProgramConfig], mic_in: asyncio.StreamReader, asr_out: asyncio.StreamWriter, chunk_buffer: Optional[Iterable[Event]] = None, ): """Segments an audio input stream, passing audio chunks to asr.""" async with (await create_process(rhasspy, DOMAIN, program)) as vad_proc: assert vad_proc.stdin is not None assert vad_proc.stdout is not None if chunk_buffer: # Buffered chunks from wake word detection for buffered_event in chunk_buffer: await asyncio.gather( async_write_event(buffered_event, vad_proc.stdin), async_write_event(buffered_event, asr_out), ) mic_task = asyncio.create_task(async_read_event(mic_in)) vad_task = asyncio.create_task(async_read_event(vad_proc.stdout)) pending = {mic_task, vad_task} timestamp = 0 in_command = False is_first_chunk = True while True: done, pending = await asyncio.wait( pending, return_when=asyncio.FIRST_COMPLETED ) if mic_task in done: mic_event = mic_task.result() if mic_event is None: break # Process chunk if AudioChunk.is_type(mic_event.type): if is_first_chunk: is_first_chunk = False 
_LOGGER.debug("segment: processing audio") chunk = AudioChunk.from_event(mic_event) timestamp = ( chunk.timestamp if chunk.timestamp is not None else time.monotonic_ns() ) if in_command: # Speech recognition and silence detection await asyncio.gather( async_write_event(mic_event, asr_out), async_write_event(mic_event, vad_proc.stdin), ) else: # Voice detection await asyncio.gather( async_write_event(mic_event, asr_out), async_write_event(mic_event, vad_proc.stdin), ) # Next chunk mic_task = asyncio.create_task(async_read_event(mic_in)) pending.add(mic_task) if vad_task in done: vad_event = vad_task.result() if vad_event is None: break if VoiceStarted.is_type(vad_event.type): if not in_command: # Start of voice command in_command = True _LOGGER.debug("segment: speaking started") elif VoiceStopped.is_type(vad_event.type): # End of voice command _LOGGER.debug("segment: speaking ended") await async_write_event( AudioStop(timestamp=timestamp).event(), asr_out ) break # Next VAD event vad_task = asyncio.create_task(async_read_event(vad_proc.stdout)) pending.add(vad_task) ================================================ FILE: rhasspy3/wake.py ================================================ """Wake word detection""" import asyncio import logging from dataclasses import dataclass from typing import AsyncIterable, MutableSequence, Optional, Union from .audio import AudioChunk, AudioStart, AudioStop from .config import PipelineProgramConfig from .core import Rhasspy from .event import Event, Eventable, async_read_event, async_write_event from .program import create_process DOMAIN = "wake" _DETECTION_TYPE = "detection" _NOT_DETECTED_TYPE = "not-detected" _LOGGER = logging.getLogger(__name__) @dataclass class Detection(Eventable): """Wake word was detected.""" name: Optional[str] = None """Name of model.""" timestamp: Optional[int] = None """Timestamp of audio chunk with detection""" @staticmethod def is_type(event_type: str) -> bool: return event_type == _DETECTION_TYPE def 
# NOTE(review): remainder of rhasspy3/wake.py (the extraction collapsed the
# file onto two physical lines).  The two methods directly below belong to
# the Detection dataclass whose header sits on the previous line.
def event(self) -> Event:
    """Serialize this detection as a wire event."""
    return Event(
        type=_DETECTION_TYPE,
        data={"name": self.name, "timestamp": self.timestamp},
    )

@staticmethod
def from_event(event: Event) -> "Detection":
    """Deserialize a detection from a wire event."""
    assert event.data is not None
    return Detection(
        name=event.data.get("name"),
        timestamp=event.data.get("timestamp"),
    )


@dataclass
class NotDetected(Eventable):
    """Audio stream ended before wake word was detected."""

    @staticmethod
    def is_type(event_type: str) -> bool:
        return event_type == _NOT_DETECTED_TYPE

    def event(self) -> Event:
        return Event(type=_NOT_DETECTED_TYPE)

    @staticmethod
    def from_event(event: Event) -> "NotDetected":
        return NotDetected()


async def detect(
    rhasspy: Rhasspy,
    program: Union[str, PipelineProgramConfig],
    mic_in: asyncio.StreamReader,
    chunk_buffer: Optional[MutableSequence[Event]] = None,
) -> Optional[Detection]:
    """Try to detect wake word in an audio stream.

    Forwards mic audio-chunk events to the wake program, optionally keeping a
    copy of each chunk in ``chunk_buffer`` so a later asr stage can replay
    them.  Returns the first Detection event produced by the wake program, or
    None when either stream ends first.
    """
    result: Optional[Detection] = None

    async with (await create_process(rhasspy, DOMAIN, program)) as wake_proc:
        assert wake_proc.stdin is not None
        assert wake_proc.stdout is not None

        def read_mic() -> "asyncio.Task":
            return asyncio.create_task(async_read_event(mic_in))

        def read_wake() -> "asyncio.Task":
            return asyncio.create_task(async_read_event(wake_proc.stdout))

        mic_task = read_mic()
        wake_task = read_wake()
        pending = {mic_task, wake_task}
        first_chunk = True

        while True:
            done, pending = await asyncio.wait(
                pending, return_when=asyncio.FIRST_COMPLETED
            )

            if mic_task in done:
                mic_event = mic_task.result()
                if mic_event is None:
                    break

                if AudioChunk.is_type(mic_event.type):
                    if first_chunk:
                        first_chunk = False
                        _LOGGER.debug("detect: processing audio")

                    await async_write_event(mic_event, wake_proc.stdin)

                    if chunk_buffer is not None:
                        # Buffer chunks for asr
                        chunk_buffer.append(mic_event)

                if result is None:
                    # Next chunk
                    mic_task = read_mic()
                    pending.add(mic_task)
                else:
                    # Ensure last mic task is finished
                    break

            if wake_task in done:
                wake_event = wake_task.result()
                if wake_event is None:
                    break

                if Detection.is_type(wake_event.type):
                    result = Detection.from_event(wake_event)
                else:
                    # Next wake event
                    wake_task = read_wake()
                    pending.add(wake_task)

    _LOGGER.debug("detect: %s", result)
    return result


async def detect_stream(
    rhasspy: Rhasspy,
    program: Union[str, PipelineProgramConfig],
    audio_stream: AsyncIterable[bytes],
    rate: int,
    width: int,
    channels: int,
) -> Optional[Detection]:
    """Try to detect the wake word in a raw audio stream.

    Sends an AudioStart, then each raw chunk, to the wake program.  When the
    stream runs dry an AudioStop is written and only the wake program's final
    verdict is awaited.
    """
    async with (await create_process(rhasspy, DOMAIN, program)) as wake_proc:
        assert wake_proc.stdin is not None
        assert wake_proc.stdout is not None

        timestamp = 0
        await async_write_event(
            AudioStart(rate, width, channels, timestamp=timestamp).event(),
            wake_proc.stdin,
        )

        async def next_chunk():
            """Get the next chunk from audio stream (None when exhausted)."""
            async for chunk_bytes in audio_stream:
                return chunk_bytes
            return None

        audio_task = asyncio.create_task(next_chunk())
        wake_task = asyncio.create_task(async_read_event(wake_proc.stdout))
        pending = {audio_task, wake_task}

        while True:
            done, pending = await asyncio.wait(
                pending, return_when=asyncio.FIRST_COMPLETED
            )

            if audio_task in done:
                raw_bytes = audio_task.result()
                if raw_bytes:
                    chunk = AudioChunk(rate, width, channels, raw_bytes)
                    await async_write_event(chunk.event(), wake_proc.stdin)
                    timestamp += chunk.milliseconds
                    audio_task = asyncio.create_task(next_chunk())
                    pending.add(audio_task)
                else:
                    # Stream exhausted: signal stop and wait only for the
                    # wake program's final event.
                    wake_task.cancel()
                    await async_write_event(AudioStop().event(), wake_proc.stdin)
                    wake_task = asyncio.create_task(
                        async_read_event(wake_proc.stdout)
                    )
                    pending = {wake_task}

            if wake_task in done:
                wake_event = wake_task.result()
                if wake_event is None:
                    break

                if Detection.is_type(wake_event.type):
                    found = Detection.from_event(wake_event)
                    _LOGGER.debug("detect: %s", found)
                    return found

                if NotDetected.is_type(wake_event.type):
                    break

                wake_task = asyncio.create_task(
                    async_read_event(wake_proc.stdout)
                )
                pending.add(wake_task)

    _LOGGER.debug("Not detected")
    return None
================================================ FILE: rhasspy3_http_api/__init__.py ================================================ ================================================ FILE: rhasspy3_http_api/__main__.py ================================================ import argparse import asyncio import logging import os import subprocess import threading from pathlib import Path from typing import Tuple from uuid import uuid4 import hypercorn import quart_cors from quart import Quart, Response, jsonify, render_template, send_from_directory from rhasspy3.audio import DEFAULT_SAMPLES_PER_CHUNK from rhasspy3.core import Rhasspy from .asr import add_asr from .handle import add_handle from .intent import add_intent from .pipeline import add_pipeline from .snd import add_snd from .tts import add_tts from .wake import add_wake _DIR = Path(__file__).parent _LOGGER = logging.getLogger("rhasspy") def main(): parser = argparse.ArgumentParser() parser.add_argument( "-c", "--config", default=_DIR.parent / "config", help="Configuration directory", ) parser.add_argument( "--pipeline", default="default", help="Name of default pipeline to run" ) parser.add_argument( "--server", nargs=2, action="append", metavar=("domain", "name"), help="Domain/name of server(s) to run", ) parser.add_argument( "--host", default="0.0.0.0", help="Host of HTTP server (default: 0.0.0.0)" ) parser.add_argument( "--port", type=int, default=13331, help="Port of HTTP server (default: 13331)" ) parser.add_argument( "--samples-per-chunk", type=int, default=DEFAULT_SAMPLES_PER_CHUNK ) parser.add_argument("--asr-chunks-to-buffer", type=int, default=0) parser.add_argument("--debug", action="store_true", help="Log DEBUG messages") args = parser.parse_args() logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) rhasspy = Rhasspy.load(args.config) pipeline = rhasspy.config.pipelines[args.pipeline] template_dir = _DIR / "templates" img_dir = _DIR / "img" css_dir = _DIR / "css" js_dir = _DIR / 
"js" app = Quart("rhasspy3", template_folder=str(template_dir)) app.secret_key = str(uuid4()) # Monkey patch quart_cors to get rid of non-standard requirement that # websockets have origin header set. def _apply_websocket_cors(*args, **kwargs): """Allow null origin.""" pass # pylint: disable=protected-access quart_cors._apply_websocket_cors = _apply_websocket_cors app = quart_cors.cors(app, allow_origin="*") add_wake(app, rhasspy, pipeline, args) add_asr(app, rhasspy, pipeline, args) add_intent(app, rhasspy, pipeline, args) add_handle(app, rhasspy, pipeline, args) add_snd(app, rhasspy, pipeline, args) add_tts(app, rhasspy, pipeline, args) add_pipeline(app, rhasspy, pipeline, args) @app.errorhandler(Exception) async def handle_error(err) -> Tuple[str, int]: """Return error as text.""" _LOGGER.exception(err) return (f"{err.__class__.__name__}: {err}", 500) @app.route("/", methods=["GET"]) async def page_index() -> str: """Render main web page.""" return await render_template("index.html", config=rhasspy.config) @app.route("/satellite.html", methods=["GET"]) async def page_satellite() -> str: """Render satellite web page.""" return await render_template("satellite.html", config=rhasspy.config) @app.route("/img/", methods=["GET"]) async def img(filename) -> Response: """Image static endpoint.""" return await send_from_directory(img_dir, filename) @app.route("/css/", methods=["GET"]) async def css(filename) -> Response: """css static endpoint.""" return await send_from_directory(css_dir, filename) @app.route("/js/", methods=["GET"]) async def js(filename) -> Response: """Javascript static endpoint.""" return await send_from_directory(js_dir, filename) @app.route("/config", methods=["GET"]) async def http_config() -> Response: return jsonify(rhasspy.config) @app.route("/version", methods=["POST"]) async def http_version() -> str: return "3.0.0" hyp_config = hypercorn.config.Config() hyp_config.bind = [f"{args.host}:{args.port}"] if args.server: run_servers(rhasspy, 
args.server) try: asyncio.run(hypercorn.asyncio.serve(app, hyp_config)) except KeyboardInterrupt: pass # ----------------------------------------------------------------------------- def run_servers(rhasspy, servers): def run_server(domain: str, name: str): try: command = [ "server_run.py", "--config", str(rhasspy.config_dir), domain, name, ] env = dict(os.environ) env["PATH"] = f'{rhasspy.base_dir}/bin:{env["PATH"]}' _LOGGER.debug(command) _LOGGER.info("Starting %s %s", domain, name) subprocess.run(command, check=True, cwd=rhasspy.base_dir, env=env) except Exception: _LOGGER.exception( "Unexpected error running server: domain=%s, name=%s", domain, name ) for domain, server_name in servers: threading.Thread( target=run_server, args=(domain, server_name), daemon=True ).start() # ----------------------------------------------------------------------------- if __name__ == "__main__": main() ================================================ FILE: rhasspy3_http_api/asr.py ================================================ import argparse import io import json import logging from quart import Quart, Response, jsonify, render_template, request, websocket from rhasspy3.asr import transcribe, transcribe_stream from rhasspy3.audio import ( DEFAULT_IN_CHANNELS, DEFAULT_IN_RATE, DEFAULT_IN_WIDTH, AudioStop, ) from rhasspy3.config import PipelineConfig from rhasspy3.core import Rhasspy from rhasspy3.event import Event _LOGGER = logging.getLogger(__name__) def add_asr( app: Quart, rhasspy: Rhasspy, pipeline: PipelineConfig, args: argparse.Namespace ) -> None: @app.route("/asr.html", methods=["GET"]) async def http_asr() -> str: return await render_template("asr.html", config=rhasspy.config) @app.route("/asr/transcribe", methods=["POST"]) async def http_asr_transcribe() -> Response: """Transcribe a WAV file.""" wav_bytes = await request.data asr_pipeline = ( rhasspy.config.pipelines[request.args["pipeline"]] if "pipeline" in request.args else pipeline ) asr_program = 
request.args.get("asr_program") or asr_pipeline.asr assert asr_program, "Missing program for asr" samples_per_chunk = int( request.args.get("samples_per_chunk", args.samples_per_chunk) ) _LOGGER.debug("transcribe: asr=%s, wav=%s byte(s)", asr_program, len(wav_bytes)) with io.BytesIO(wav_bytes) as wav_in: transcript = await transcribe( rhasspy, asr_program, wav_in, samples_per_chunk ) _LOGGER.debug("transcribe: transcript='%s'", transcript) return jsonify(transcript.event().to_dict() if transcript is not None else {}) @app.websocket("/asr/transcribe") async def ws_asr_transcribe(): """Transcribe a websocket audio stream.""" asr_pipeline = ( rhasspy.config.pipelines[websocket.args["pipeline"]] if "pipeline" in websocket.args else pipeline ) asr_program = websocket.args.get("asr_program") or asr_pipeline.asr assert asr_program, "Missing program for asr" vad_program = websocket.args.get("vad_program") or asr_pipeline.vad assert vad_program, "Missing program for vad" rate = int(websocket.args.get("rate", DEFAULT_IN_RATE)) width = int(websocket.args.get("width", DEFAULT_IN_WIDTH)) channels = int(websocket.args.get("channels", DEFAULT_IN_CHANNELS)) _LOGGER.debug( "transcribe: asr=%s, vad=%s, rate=%s, width=%s, channels=%s", asr_program, vad_program, rate, width, channels, ) async def audio_stream(): while True: data = await websocket.receive() if not data: # Empty message signals stop break if isinstance(data, bytes): # Raw audio yield data else: event = Event.from_dict(json.loads(data)) if AudioStop.is_type(event.type): # Stop event break transcript = await transcribe_stream( rhasspy, asr_program, vad_program, audio_stream(), rate, width, channels ) _LOGGER.debug("transcribe: transcript='%s'", transcript) await websocket.send_json( transcript.event().to_dict() if transcript is not None else {} ) ================================================ FILE: rhasspy3_http_api/css/main.css ================================================ #header { border-bottom: 1px solid black; } 
# NOTE(review): rhasspy3_http_api/handle.py (extraction collapsed the file;
# this line also carried the tail of main.css and the module imports).
def add_handle(
    app: Quart, rhasspy: Rhasspy, pipeline: PipelineConfig, args: argparse.Namespace
) -> None:
    """Register the /handle HTTP endpoint on the Quart app."""

    @app.route("/handle/handle", methods=["GET", "POST"])
    async def http_handle_handle() -> Response:
        """Handle text or intent JSON.

        GET takes the input from the ``input`` query parameter; POST takes it
        from the request body.  A JSON body must be a transcript, intent, or
        not-recognized event; anything else is treated as plain text.
        """
        if request.method == "GET":
            data = request.args["input"]
        else:
            data = (await request.data).decode()

        handle_pipeline = (
            rhasspy.config.pipelines[request.args["pipeline"]]
            if "pipeline" in request.args
            else pipeline
        )

        # Input can be plain text or a JSON intent
        handle_input: Optional[Union[Intent, NotRecognized, Transcript]] = None
        if request.content_type == "application/json":
            # BUG FIX: the original built the event with Event(json.loads(data)),
            # which passes the parsed dict as the event *type*; the payload must
            # be deserialized with Event.from_dict() (as done in asr.py/snd.py).
            event = Event.from_dict(json.loads(data))
            for event_class in _HANDLE_INPUT_TYPES:
                if event_class.is_type(event.type):
                    handle_input = event_class.from_event(event)
                    break  # first matching type wins
        else:
            # Assume plain text
            handle_input = Transcript(data)

        assert handle_input is not None, "Invalid input"

        handle_program = request.args.get("handle_program") or handle_pipeline.handle
        assert handle_program is not None, "Missing program for handle"

        _LOGGER.debug("handle: handle=%s, input='%s'", handle_program, handle_input)
        result = await handle(rhasspy, handle_program, handle_input)
        _LOGGER.debug("handle: result=%s", result)

        return jsonify(result.event().to_dict() if result is not None else {})
# NOTE(review): rhasspy3_http_api/intent.py handler (extraction collapsed
# the file; this line also carried the module imports).
def add_intent(
    app: Quart, rhasspy: Rhasspy, pipeline: PipelineConfig, args: argparse.Namespace
) -> None:
    """Register the /intent HTTP endpoint on the Quart app."""

    @app.route("/intent/recognize", methods=["GET", "POST"])
    async def http_intent_recognize() -> Response:
        """Recognize intent from text (query param on GET, body on POST)."""
        text = (
            request.args["text"]
            if request.method == "GET"
            else (await request.data).decode()
        )

        intent_pipeline = (
            rhasspy.config.pipelines[request.args["pipeline"]]
            if "pipeline" in request.args
            else pipeline
        )
        intent_program = request.args.get("intent_program") or intent_pipeline.intent
        assert intent_program, "Missing program for intent"

        _LOGGER.debug("recognize: intent=%s, text='%s'", intent_program, text)
        result = await recognize(rhasspy, intent_program, text)
        _LOGGER.debug("recognize: result=%s", result)

        return jsonify(result.event().to_dict() if result is not None else {})
dv.setUint8(p + i, s.charCodeAt(i)); } p += s.length; } function writeUint32(d) { dv.setUint32(p, d, true); p += 4; } function writeUint16(d) { dv.setUint16(p, d, true); p += 2; } writeString('RIFF'); // ChunkID writeUint32(dataSize + 36); // ChunkSize writeString('WAVE'); // Format writeString('fmt '); // Subchunk1ID writeUint32(16); // Subchunk1Size writeUint16(format); // AudioFormat writeUint16(numChannels); // NumChannels writeUint32(sampleRate); // SampleRate writeUint32(byteRate); // ByteRate writeUint16(blockAlign); // BlockAlign writeUint16(bytesPerSample * 8); // BitsPerSample writeString('data'); // Subchunk2ID writeUint32(dataSize); // Subchunk2Size return buffer; } ================================================ FILE: rhasspy3_http_api/js/recorder.worklet.js ================================================ class RecorderProcessor extends AudioWorkletProcessor { constructor() { super(); } process(inputList, outputList, parameters) { if (inputList[0].length < 1) { return true; } const float32Data = inputList[0][0]; const int16Data = new Int16Array(float32Data.length); for (let i = 0; i < float32Data.length; i++) { const s = Math.max(-1, Math.min(1, float32Data[i])); int16Data[i] = s < 0 ? 
s * 0x8000 : s * 0x7fff; } this.port.postMessage(int16Data); return true; } }; registerProcessor("recorder.worklet", RecorderProcessor); ================================================ FILE: rhasspy3_http_api/pipeline.py ================================================ import argparse import asyncio import io import logging from enum import Enum from typing import IO, Optional, Union from quart import Quart, Response, jsonify, render_template, request, websocket from rhasspy3.asr import DOMAIN as ASR_DOMAIN from rhasspy3.asr import Transcript from rhasspy3.audio import ( DEFAULT_IN_CHANNELS, DEFAULT_IN_RATE, DEFAULT_IN_WIDTH, DEFAULT_OUT_CHANNELS, DEFAULT_OUT_RATE, DEFAULT_OUT_WIDTH, AudioChunk, AudioChunkConverter, AudioStart, AudioStop, ) from rhasspy3.config import PipelineConfig from rhasspy3.core import Rhasspy from rhasspy3.event import Event, async_read_event, async_write_event from rhasspy3.handle import Handled, NotHandled, handle from rhasspy3.intent import Intent, NotRecognized from rhasspy3.pipeline import StopAfterDomain from rhasspy3.pipeline import run as run_pipeline from rhasspy3.program import create_process from rhasspy3.tts import synthesize_stream from rhasspy3.vad import DOMAIN as VAD_DOMAIN from rhasspy3.vad import VoiceStarted, VoiceStopped from rhasspy3.wake import Detection _LOGGER = logging.getLogger(__name__) class StartAfterDomain(str, Enum): WAKE = "wake" ASR = "asr" INTENT = "intent" HANDLE = "handle" TTS = "tts" def add_pipeline( app: Quart, rhasspy: Rhasspy, pipeline: PipelineConfig, args: argparse.Namespace ) -> None: @app.route("/pipeline.html", methods=["GET"]) async def http_pipeline() -> str: return await render_template("pipeline.html", config=rhasspy.config) @app.route("/pipeline/run", methods=["POST"]) async def http_pipeline_run() -> Response: running_pipeline = ( rhasspy.config.pipelines[request.args["pipeline"]] if "pipeline" in request.args else pipeline ) mic_program = request.args.get("mic_program") or 
running_pipeline.mic wake_program = request.args.get("wake_program") or running_pipeline.wake vad_program = request.args.get("vad_program") or running_pipeline.vad asr_program = request.args.get("asr_program") or running_pipeline.asr intent_program = request.args.get("intent_program") or running_pipeline.intent handle_program = request.args.get("handle_program") or running_pipeline.handle tts_program = request.args.get("tts_program") or running_pipeline.tts snd_program = request.args.get("snd_program") or running_pipeline.snd # start_after = request.args.get("start_after") stop_after = request.args.get("stop_after") # samples_per_chunk = int( request.args.get("samples_per_chunk", args.samples_per_chunk) ) asr_chunks_to_buffer = int( request.args.get("asr_chunks_to_buffer", args.asr_chunks_to_buffer) ) _LOGGER.debug( "run: " "mic=%s," "wake=%s," "vad=%s," "asr=%s," "intent=%s," "handle=%s," "tts=%s," "snd=%s," "start_after=%s " "stop_after=%s", mic_program, wake_program, vad_program, asr_program, intent_program, handle_program, tts_program, snd_program, start_after, stop_after, ) wake_detection: Optional[Detection] = None asr_wav_in: Optional[IO[bytes]] = None asr_transcript: Optional[Transcript] = None intent_result: Optional[Union[Intent, NotRecognized]] = None handle_result: Optional[Union[Handled, NotHandled]] = None tts_wav_in: Optional[IO[bytes]] = None if start_after: # Determine where to start in the pipeline start_after = StartAfterDomain(start_after) if start_after == StartAfterDomain.WAKE: # Body is detected wake name name = (await request.data).decode() wake_detection = Detection(name=name) elif start_after == StartAfterDomain.ASR: # Body is transcript or WAV if request.content_type == "audio/wav": wav_bytes = await request.data asr_wav_in = io.BytesIO(wav_bytes) else: text = (await request.data).decode() asr_transcript = Transcript(text=text) elif start_after == StartAfterDomain.INTENT: # Body is JSON event = Event.from_dict(await request.json) if 
Intent.is_type(event.type): intent_result = Intent.from_event(event) elif NotRecognized.is_type(event.type): intent_result = NotRecognized.from_event(event) else: raise ValueError(f"Unexpected event type: {event.type}") elif start_after == StartAfterDomain.HANDLE: # Body is text or JSON if request.content_type == "application/json": event = Event.from_dict(await request.json) if Handled.is_type(event.type): handle_result = Handled.from_event(event) elif NotRecognized.is_type(event.type): handle_result = NotHandled.from_event(event) else: raise ValueError(f"Unexpected event type: {event.type}") else: # Plain text text = (await request.data).decode() handle_result = Handled(text=text) elif start_after == StartAfterDomain.TTS: # Body is or WAV wav_bytes = await request.data tts_wav_in = io.BytesIO(wav_bytes) pipeline_result = await run_pipeline( rhasspy, pipeline, samples_per_chunk, asr_chunks_to_buffer=asr_chunks_to_buffer, mic_program=mic_program, wake_program=wake_program, wake_detection=wake_detection, asr_program=asr_program, asr_transcript=asr_transcript, asr_wav_in=asr_wav_in, vad_program=vad_program, intent_program=intent_program, intent_result=intent_result, handle_program=handle_program, handle_result=handle_result, tts_program=tts_program, tts_wav_in=tts_wav_in, snd_program=snd_program, stop_after=StopAfterDomain(stop_after) if stop_after else None, ) return jsonify(pipeline_result.to_event_dict()) @app.websocket("/pipeline/asr-tts") async def ws_api_asr_tts() -> None: running_pipeline = ( rhasspy.config.pipelines[websocket.args["pipeline"]] if "pipeline" in websocket.args else pipeline ) asr_program = websocket.args.get("asr_program") or running_pipeline.asr assert asr_program, "Missing asr program" vad_program = websocket.args.get("vad_program") or running_pipeline.vad assert vad_program, "Missing vad program" handle_program = websocket.args.get("handle_program") or running_pipeline.handle assert handle_program, "Missing handle program" tts_program = 
websocket.args.get("tts_program") or running_pipeline.tts assert tts_program, "Missing tts program" in_rate = int(websocket.args.get("in_rate", DEFAULT_IN_RATE)) in_width = int(websocket.args.get("in_width", DEFAULT_IN_WIDTH)) in_channels = int(websocket.args.get("in_channels", DEFAULT_IN_CHANNELS)) out_rate = int(websocket.args.get("out_rate", DEFAULT_OUT_RATE)) out_width = int(websocket.args.get("out_width", DEFAULT_OUT_WIDTH)) out_channels = int(websocket.args.get("out_channels", DEFAULT_OUT_CHANNELS)) # asr + vad async with ( await create_process(rhasspy, ASR_DOMAIN, asr_program) ) as asr_proc, ( await create_process(rhasspy, VAD_DOMAIN, vad_program) ) as vad_proc: assert asr_proc.stdin is not None assert asr_proc.stdout is not None assert vad_proc.stdin is not None assert vad_proc.stdout is not None mic_task = asyncio.create_task(websocket.receive()) vad_task = asyncio.create_task(async_read_event(vad_proc.stdout)) pending = {mic_task, vad_task} while True: done, pending = await asyncio.wait( pending, return_when=asyncio.FIRST_COMPLETED ) if mic_task in done: audio_bytes = mic_task.result() if isinstance(audio_bytes, bytes) and audio_bytes: mic_chunk = AudioChunk( in_rate, in_width, in_channels, audio_bytes ) mic_chunk_event = mic_chunk.event() await asyncio.gather( async_write_event(mic_chunk_event, asr_proc.stdin), async_write_event(mic_chunk_event, vad_proc.stdin), ) mic_task = asyncio.create_task(websocket.receive()) pending.add(mic_task) if vad_task in done: vad_event = vad_task.result() if vad_event is None: break # Forward to websocket await websocket.send_json(vad_event.to_dict()) if VoiceStarted.is_type(vad_event.type): _LOGGER.debug("stream-to-stream: voice started") elif VoiceStopped.is_type(vad_event.type): _LOGGER.debug("stream-to-stream: voice stopped") break vad_task = asyncio.create_task(async_read_event(vad_proc.stdout)) pending.add(vad_task) # Get transcript from asr await async_write_event(AudioStop().event(), asr_proc.stdin) transcript: 
Optional[Transcript] = None while True: asr_event = await async_read_event(asr_proc.stdout) if asr_event is None: break # Forward to websocket await websocket.send_json(asr_event.to_dict()) if Transcript.is_type(asr_event.type): transcript = Transcript.from_event(asr_event) _LOGGER.debug("stream-to-stream: asr=%s", transcript) break handle_result: Optional[Union[Handled, NotHandled]] = None if transcript is not None: handle_result = await handle(rhasspy, handle_program, transcript) _LOGGER.debug("stream-to-stream: handle=%s", handle_result) if (handle_result is not None) and handle_result.text: # Forward to websocket await websocket.send_json(handle_result.event().to_dict()) _LOGGER.debug("stream-to-stream: sending tts") await websocket.send_json( AudioStart(out_rate, out_width, out_channels).event().to_dict() ) converter = AudioChunkConverter(out_rate, out_width, out_channels) async for tts_chunk in synthesize_stream( rhasspy, tts_program, handle_result.text ): tts_chunk = converter.convert(tts_chunk) await websocket.send(tts_chunk.audio) _LOGGER.debug("stream-to-stream: tts done") await websocket.send_json(AudioStop().event().to_dict()) ================================================ FILE: rhasspy3_http_api/snd.py ================================================ import argparse import io import json import logging from quart import Quart, Response, jsonify, request, websocket from rhasspy3.audio import ( DEFAULT_OUT_CHANNELS, DEFAULT_OUT_RATE, DEFAULT_OUT_WIDTH, AudioStop, ) from rhasspy3.config import PipelineConfig from rhasspy3.core import Rhasspy from rhasspy3.event import Event from rhasspy3.snd import play, play_stream _LOGGER = logging.getLogger(__name__) def add_snd( app: Quart, rhasspy: Rhasspy, pipeline: PipelineConfig, args: argparse.Namespace ) -> None: @app.route("/snd/play", methods=["POST"]) async def http_snd_play() -> Response: """Play WAV file.""" wav_bytes = await request.data snd_pipeline = ( rhasspy.config.pipelines[request.args["pipeline"]] 
if "pipeline" in request.args else pipeline ) snd_program = request.args.get("snd_program") or snd_pipeline.snd assert snd_program, "Missing program for snd" samples_per_chunk = int( request.args.get("samples_per_chunk", args.samples_per_chunk) ) _LOGGER.debug("play: snd=%s, wav=%s byte(s)", snd_program, len(wav_bytes)) with io.BytesIO(wav_bytes) as wav_in: played = await play( rhasspy, snd_program, wav_in, samples_per_chunk, ) return jsonify(played.event().to_dict() if played is not None else {}) @app.websocket("/snd/play") async def ws_snd_play(): """Play websocket audio stream.""" snd_pipeline = ( rhasspy.config.pipelines[request.args["pipeline"]] if "pipeline" in request.args else pipeline ) snd_program = websocket.args.get("snd_program") or snd_pipeline.snd assert snd_program, "Missing program for snd" rate = int(websocket.args.get("rate", DEFAULT_OUT_RATE)) width = int(websocket.args.get("width", DEFAULT_OUT_WIDTH)) channels = int(websocket.args.get("channels", DEFAULT_OUT_CHANNELS)) _LOGGER.debug("play: snd=%s", snd_program) async def audio_stream(): while True: data = await websocket.receive() if not data: # Empty message signals stop break if isinstance(data, bytes): # Raw audio yield data else: event = Event.from_dict(json.loads(data)) if AudioStop.is_type(event.type): # Stop event break played = await play_stream( rhasspy, snd_program, audio_stream(), rate, width, channels ) await websocket.send_json( played.event().to_dict() if played is not None else {} ) ================================================ FILE: rhasspy3_http_api/templates/asr.html ================================================ {% extends "layout.html" %} {% block body %}

Speech to Text (asr)

Transcribe

  1. Pipeline:
  2. Mic Program:
  3. VAD Program:
  4. ASR Program:
  5. Status:
  6. Transcript:
{% endblock %} ================================================ FILE: rhasspy3_http_api/templates/index.html ================================================ {% extends "layout.html" %} {% block body %} {% endblock %} ================================================ FILE: rhasspy3_http_api/templates/layout.html ================================================ Rhasspy {% block body %}{% endblock %} ================================================ FILE: rhasspy3_http_api/templates/pipeline.html ================================================ {% extends "layout.html" %} {% block body %}

Pipeline

  1. Pipeline:
  2. Mic Program:
  3. Wake Program:
  4. ASR Program:
  5. VAD Program:
  6. Intent Program:
  7. Handle Program:
  8. TTS Program:
  9. Snd Program:
  10. starting after stopping after
  11. Status:
  12. Result:
{% endblock %} ================================================ FILE: rhasspy3_http_api/templates/satellite.html ================================================ {% extends "layout.html" %} {% block body %}

Satellite

  1. Pipeline:
  2. ASR Program:
  3. VAD Program:
  4. Handle Program:
  5. TTS Program:
  6. Status:
  7. Events:
{% endblock %} ================================================ FILE: rhasspy3_http_api/templates/tts.html ================================================ {% extends "layout.html" %} {% block body %}

Text to Speech (tts)

Speak

  1. Text:
  2. Pipeline:
  3. TTS Program:
  4. Snd Program:
  5. Status:
{% endblock %} ================================================ FILE: rhasspy3_http_api/tts.py ================================================ import argparse import io import logging from quart import Quart, Response, jsonify, render_template, request from rhasspy3.config import PipelineConfig from rhasspy3.core import Rhasspy from rhasspy3.snd import play from rhasspy3.tts import synthesize _LOGGER = logging.getLogger(__name__) def add_tts( app: Quart, rhasspy: Rhasspy, pipeline: PipelineConfig, args: argparse.Namespace ) -> None: @app.route("/tts.html", methods=["GET"]) async def http_tts() -> str: return await render_template("tts.html", config=rhasspy.config) @app.route("/tts/synthesize", methods=["GET", "POST"]) async def http_tts_synthesize() -> Response: """Synthesize a WAV file from text.""" if request.method == "GET": text = request.args["text"] else: text = (await request.data).decode() tts_pipeline = ( rhasspy.config.pipelines[request.args["pipeline"]] if "pipeline" in request.args else pipeline ) tts_program = request.args.get("tts_program") or tts_pipeline.tts assert tts_program, "No tts program" _LOGGER.debug("synthesize: tts=%s, text='%s'", tts_program, text) with io.BytesIO() as wav_out: await synthesize(rhasspy, tts_program, text, wav_out) wav_bytes = wav_out.getvalue() _LOGGER.debug("synthesize: wav=%s byte(s)", len(wav_bytes)) return Response(wav_bytes, mimetype="audio/wav") @app.route("/tts/speak", methods=["GET", "POST"]) async def http_tts_speak() -> Response: """Synthesize audio from text and play.""" if request.method == "GET": text = request.args["text"] else: text = (await request.data).decode() tts_pipeline = ( rhasspy.config.pipelines[request.args["pipeline"]] if "pipeline" in request.args else pipeline ) tts_program = request.args.get("tts_program") or tts_pipeline.tts snd_program = request.args.get("snd_program") or tts_pipeline.snd samples_per_chunk = int( request.args.get("samples_per_chunk", args.samples_per_chunk) ) assert 
def add_wake(
    app: Quart, rhasspy: Rhasspy, pipeline: PipelineConfig, args: argparse.Namespace
) -> None:
    """Register HTTP and websocket endpoints for wake word detection."""

    @app.route("/wake/detect", methods=["GET", "POST"])
    async def http_wake_detect() -> Response:
        """Detect wake word in a posted WAV file, or from the mic if no body."""
        wav_bytes = await request.data

        # The caller may select a named pipeline; otherwise use the default.
        wake_pipeline = (
            rhasspy.config.pipelines[request.args["pipeline"]]
            if "pipeline" in request.args
            else pipeline
        )
        wake_program = request.args.get("wake_program") or wake_pipeline.wake
        assert wake_program, "Missing program for wake"

        if wav_bytes:
            # Detect from WAV
            samples_per_chunk = int(
                request.args.get("samples_per_chunk", args.samples_per_chunk)
            )
            _LOGGER.debug(
                "detect: wake=%s, wav=%s byte(s)", wake_program, len(wav_bytes)
            )
            with io.BytesIO(wav_bytes) as wav_io:
                wav_file: wave.Wave_read = wave.open(wav_io, "rb")
                with wav_file:

                    async def audio_stream():
                        # Yield fixed-size frame chunks until the WAV is drained.
                        chunk = wav_file.readframes(samples_per_chunk)
                        while chunk:
                            yield chunk
                            chunk = wav_file.readframes(samples_per_chunk)

                    detection = await detect_stream(
                        rhasspy,
                        wake_program,
                        audio_stream(),
                        wav_file.getframerate(),
                        wav_file.getsampwidth(),
                        wav_file.getnchannels(),
                    )
        else:
            # Detect from mic
            mic_program = request.args.get("mic_program") or wake_pipeline.mic
            assert mic_program, "Missing program for mic"
            _LOGGER.debug("detect: mic=%s, wake=%s", mic_program, wake_program)
            async with (
                await create_process(rhasspy, MIC_DOMAIN, mic_program)
            ) as mic_proc:
                assert mic_proc.stdout is not None
                detection = await detect(rhasspy, wake_program, mic_proc.stdout)

        _LOGGER.debug("wake: detection=%s", detection)
        return jsonify(detection.event().to_dict() if detection is not None else {})

    @app.websocket("/wake/detect")
    async def ws_wake_detect():
        """Detect wake word in websocket audio stream."""
        # FIX: use websocket.args, not request.args — Quart does not bind the
        # request proxy inside a websocket handler, so the original lookup
        # raised whenever a client connected. Every other parameter below
        # already (correctly) reads websocket.args.
        wake_pipeline = (
            rhasspy.config.pipelines[websocket.args["pipeline"]]
            if "pipeline" in websocket.args
            else pipeline
        )
        wake_program = websocket.args.get("wake_program") or wake_pipeline.wake
        assert wake_program, "Missing program for wake"

        # Audio format of the incoming raw stream (from query parameters).
        rate = int(websocket.args.get("rate", DEFAULT_IN_RATE))
        width = int(websocket.args.get("width", DEFAULT_IN_WIDTH))
        channels = int(websocket.args.get("channels", DEFAULT_IN_CHANNELS))
        _LOGGER.debug("detect: wake=%s", wake_program)

        async def audio_stream():
            while True:
                data = await websocket.receive()
                if not data:
                    # Empty message signals stop
                    break

                if isinstance(data, bytes):
                    # Raw audio
                    yield data
                else:
                    event = Event.from_dict(json.loads(data))
                    if AudioStop.is_type(event.type):
                        # Stop event
                        break

        detection = await detect_stream(
            rhasspy, wake_program, audio_stream(), rate, width, channels
        )
        _LOGGER.debug("detect: detection='%s'", detection)
        await websocket.send_json(
            detection.event().to_dict() if detection is not None else {}
        )
#!/usr/bin/env bash
set -eo pipefail

# Resolve the repository root relative to this script.
this_dir="$(cd "$(dirname "$0")" && pwd)"
base_dir="$(realpath "${this_dir}/..")"

# Use the in-repo virtual environment unless $venv points elsewhere.
: "${venv:=${base_dir}/.venv}"
if [[ -d "${venv}" ]]; then
    # Activate virtual environment if available
    . "${venv}/bin/activate"
fi

# Directories containing the Python sources to lint.
python_files=(
    "${base_dir}/bin"
    "${base_dir}/rhasspy3"
    "${base_dir}/rhasspy3_http_api"
    "${base_dir}/programs"
)

# Check (no files are modified)
black "${python_files[@]}" --check
isort "${python_files[@]}" --check
flake8 "${python_files[@]}"
pylint "${python_files[@]}"
mypy "${base_dir}/bin" "${base_dir}/rhasspy3"
#!/usr/bin/env bash
set -eo pipefail

# Resolve the repository root relative to this script.
this_dir="$(cd "$(dirname "$0")" && pwd)"
base_dir="$(realpath "${this_dir}/..")"

# Ensure the user configuration directory exists (default config lives here).
config_dir="${base_dir}/config"
mkdir -p "${config_dir}"

# -----------------------------------------------------------------------------

echo "OK"
#!/usr/bin/env bash
set -eo pipefail

# Resolve the repository root relative to this script.
this_dir="$(cd "$(dirname "$0")" && pwd)"
base_dir="$(realpath "${this_dir}/..")"

# Use the in-repo virtual environment unless $venv points elsewhere.
: "${venv:=${base_dir}/.venv}"
if [[ -d "${venv}" ]]; then
    # Activate virtual environment if available
    . "${venv}/bin/activate"
fi

# Make the in-repo packages importable, preserving any existing PYTHONPATH.
export PYTHONPATH="${base_dir}:${PYTHONPATH}"

pytest -vv "${base_dir}/tests"
# -----------------------------------------------------------------------------

# Load README in as long description
long_description: str = ""
readme_path = this_dir / "README.md"
if readme_path.is_file():
    long_description = readme_path.read_text(encoding="utf-8")

# Runtime dependencies come from requirements.txt when it exists.
requirements = []
requirements_path = this_dir / "requirements.txt"
if requirements_path.is_file():
    with open(requirements_path, "r", encoding="utf-8") as requirements_file:
        requirements = requirements_file.read().splitlines()

# The package version is stored in rhasspy3/VERSION.
version_path = module_dir / "VERSION"
with open(version_path, "r", encoding="utf-8") as version_file:
    version = version_file.read().strip()

# -----------------------------------------------------------------------------

setup(
    name="rhasspy3",
    version=version,
    description="Rhasspy Voice Assistant Toolkit",
    long_description=long_description,
    url="http://github.com/rhasspy/rhasspy3",
    author="Michael Hansen",
    author_email="mike@rhasspy.org",
    license="MIT",
    packages=setuptools.find_packages(),
    package_data={
        "rhasspy3": ["VERSION", "py.typed"],
    },
    install_requires=requirements,
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "Topic :: Text Processing :: Linguistic",
        # FIX: was duplicated as
        # "License :: OSI Approved :: License :: OSI Approved :: MIT License",
        # which is not a valid trove classifier.
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
    ],
    keywords="voice assistant rhasspy",
)
# NOTE(review): the YAML literal's internal indentation was reconstructed from
# the expected parse result below — confirm against the original file.
YAML = """
# Line comment
outer_a:
  # Inline comment
  name: outer_a
  prop_int: 1
  prop_float: 1.23
  prop_bool: true
  prop_bool2: false
  prop_str_noquotes: hello: world
  prop_str_1quotes: 'hello: world'
  prop_str_2quotes: "hello: world"
  prop_str_literal: |
    hello:
    world
  inner_a:
    name: inner_a
  empty_string: ""
  string_with_hash: "#test"
outer_b:
  name: inner_b
"""


def test_safe_load():
    """safe_load parses the JAML document into nested plain dicts."""
    expected = {
        "outer_a": {
            "name": "outer_a",
            "prop_int": 1,
            "prop_float": 1.23,
            "prop_bool": True,
            "prop_bool2": False,
            "prop_str_noquotes": "hello: world",
            "prop_str_1quotes": "hello: world",
            "prop_str_2quotes": "hello: world",
            "prop_str_literal": "hello:\nworld",
            "inner_a": {"name": "inner_a"},
            "empty_string": "",
            "string_with_hash": "#test",
        },
        "outer_b": {"name": "inner_b"},
    }
    with io.StringIO(YAML) as yaml:
        assert safe_load(yaml) == expected
#!/usr/bin/env bash
set -eo pipefail

# Resolve the tool's base directory relative to this script.
this_dir="$(cd "$(dirname "$0")" && pwd)"
base_dir="$(realpath "${this_dir}/..")"

# Use the tool's virtual environment unless $venv points elsewhere.
: "${venv:=${base_dir}/.venv}"
if [[ -d "${venv}" ]]; then
    . "${venv}/bin/activate"
fi

# Forward all arguments to the websocket client.
export PATH="${base_dir}/bin:${PATH}"
python3 "${base_dir}/bin/websocket_client.py" "$@"
"${venv}/bin/activate" # Install Python dependencies echo 'Installing Python dependencies' pip3 install --upgrade pip pip3 install --upgrade wheel setuptools pip3 install -r "${base_dir}/requirements.txt" # ----------------------------------------------------------------------------- echo "OK"