[
  {
    "path": ".github/dependabot.yml",
    "content": "version: 2\nupdates:\n- package-ecosystem: pip\n  directory: \"/\"\n  schedule:\n    interval: daily\n  open-pull-requests-limit: 10\n"
  },
  {
    "path": "LICENSE",
    "content": "MIT License\n\nCopyright (c) 2021 Yuichiro Tachibana (Tsuchiya)\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "Makefile",
    "content": "deps/update:\n\tpipreqs --print . | sort > requirements.txt\n"
  },
  {
    "path": "README.md",
    "content": "# streamlit-stt-app\n\n[![Open in Streamlit](https://static.streamlit.io/badges/streamlit_badge_black_white.svg)](https://share.streamlit.io/whitphx/streamlit-stt-app/main/app_deepspeech.py)\n\n[![ko-fi](https://ko-fi.com/img/githubbutton_sm.svg)](https://ko-fi.com/D1D2ERWFG)\n\n<a href=\"https://www.buymeacoffee.com/whitphx\" target=\"_blank\"><img src=\"https://cdn.buymeacoffee.com/buttons/v2/default-yellow.png\" alt=\"Buy Me A Coffee\" width=\"180\" height=\"50\" ></a>\n\n[![GitHub Sponsors](https://img.shields.io/github/sponsors/whitphx?label=Sponsor%20me%20on%20GitHub%20Sponsors&style=social)](https://github.com/sponsors/whitphx)\n\nA real time Speech-to-Text app built with Streamlit and [`streamlit-webrtc`](https://github.com/whitphx/streamlit-webrtc).\nThe STT engine is [`mozilla/DeepSpeech`](https://github.com/mozilla/DeepSpeech).\n\n[Hosted on Streamlit Sharing](https://share.streamlit.io/whitphx/streamlit-stt-app/main/app_deepspeech.py).\n\nThe forum post about this app is\nhttps://discuss.streamlit.io/t/new-component-streamlit-webrtc-a-new-way-to-deal-with-real-time-media-streams/8669/31.\n\n![](./docs/img/example.gif)\n\n## Other examples\n- STT with Whisper API by @goldengrape: https://github.com/whitphx/streamlit-stt-app/issues/133\n\n"
  },
  {
    "path": "app_deepspeech.py",
    "content": "import logging\nimport logging.handlers\nimport queue\nimport threading\nimport time\nimport urllib.request\nimport os\nfrom collections import deque\nfrom pathlib import Path\nfrom typing import List\n\nimport av\nimport numpy as np\nimport pydub\nimport streamlit as st\nfrom twilio.rest import Client\n\nfrom streamlit_webrtc import WebRtcMode, webrtc_streamer\n\nHERE = Path(__file__).parent\n\nlogger = logging.getLogger(__name__)\n\n\n# This code is based on https://github.com/streamlit/demo-self-driving/blob/230245391f2dda0cb464008195a470751c01770b/streamlit_app.py#L48  # noqa: E501\ndef download_file(url, download_to: Path, expected_size=None):\n    # Don't download the file twice.\n    # (If possible, verify the download using the file length.)\n    if download_to.exists():\n        if expected_size:\n            if download_to.stat().st_size == expected_size:\n                return\n        else:\n            st.info(f\"{url} is already downloaded.\")\n            if not st.button(\"Download again?\"):\n                return\n\n    download_to.parent.mkdir(parents=True, exist_ok=True)\n\n    # These are handles to two visual elements to animate.\n    weights_warning, progress_bar = None, None\n    try:\n        weights_warning = st.warning(\"Downloading %s...\" % url)\n        progress_bar = st.progress(0)\n        with open(download_to, \"wb\") as output_file:\n            with urllib.request.urlopen(url) as response:\n                length = int(response.info()[\"Content-Length\"])\n                counter = 0.0\n                MEGABYTES = 2.0 ** 20.0\n                while True:\n                    data = response.read(8192)\n                    if not data:\n                        break\n                    counter += len(data)\n                    output_file.write(data)\n\n                    # We perform animation by overwriting the elements.\n                    weights_warning.warning(\n                        \"Downloading %s... (%6.2f/%6.2f MB)\"\n                        % (url, counter / MEGABYTES, length / MEGABYTES)\n                    )\n                    progress_bar.progress(min(counter / length, 1.0))\n    # Finally, we remove these visual elements by calling .empty().\n    finally:\n        if weights_warning is not None:\n            weights_warning.empty()\n        if progress_bar is not None:\n            progress_bar.empty()\n\n\n# This code is based on https://github.com/whitphx/streamlit-webrtc/blob/c1fe3c783c9e8042ce0c95d789e833233fd82e74/sample_utils/turn.py\n@st.cache_data  # type: ignore\ndef get_ice_servers():\n    \"\"\"Use Twilio's TURN server because Streamlit Community Cloud has changed\n    its infrastructure and WebRTC connection cannot be established without TURN server now.  # noqa: E501\n    We considered Open Relay Project (https://www.metered.ca/tools/openrelay/) too,\n    but it is not stable and hardly works as some people reported like https://github.com/aiortc/aiortc/issues/832#issuecomment-1482420656  # noqa: E501\n    See https://github.com/whitphx/streamlit-webrtc/issues/1213\n    \"\"\"\n\n    # Ref: https://www.twilio.com/docs/stun-turn/api\n    try:\n        account_sid = os.environ[\"TWILIO_ACCOUNT_SID\"]\n        auth_token = os.environ[\"TWILIO_AUTH_TOKEN\"]\n    except KeyError:\n        logger.warning(\n            \"Twilio credentials are not set. Fallback to a free STUN server from Google.\"  # noqa: E501\n        )\n        return [{\"urls\": [\"stun:stun.l.google.com:19302\"]}]\n\n    client = Client(account_sid, auth_token)\n\n    token = client.tokens.create()\n\n    return token.ice_servers\n\n\n\ndef main():\n    st.header(\"Real Time Speech-to-Text\")\n    st.markdown(\n        \"\"\"\nThis demo app is using [DeepSpeech](https://github.com/mozilla/DeepSpeech),\nan open speech-to-text engine.\n\nA pre-trained model released with\n[v0.9.3](https://github.com/mozilla/DeepSpeech/releases/tag/v0.9.3),\ntrained on American English is being served.\n\"\"\"\n    )\n\n    # https://github.com/mozilla/DeepSpeech/releases/tag/v0.9.3\n    MODEL_URL = \"https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.pbmm\"  # noqa\n    LANG_MODEL_URL = \"https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.scorer\"  # noqa\n    MODEL_LOCAL_PATH = HERE / \"models/deepspeech-0.9.3-models.pbmm\"\n    LANG_MODEL_LOCAL_PATH = HERE / \"models/deepspeech-0.9.3-models.scorer\"\n\n    download_file(MODEL_URL, MODEL_LOCAL_PATH, expected_size=188915987)\n    download_file(LANG_MODEL_URL, LANG_MODEL_LOCAL_PATH, expected_size=953363776)\n\n    lm_alpha = 0.931289039105002\n    lm_beta = 1.1834137581510284\n    beam = 100\n\n    sound_only_page = \"Sound only (sendonly)\"\n    with_video_page = \"With video (sendrecv)\"\n    app_mode = st.selectbox(\"Choose the app mode\", [sound_only_page, with_video_page])\n\n    if app_mode == sound_only_page:\n        app_sst(\n            str(MODEL_LOCAL_PATH), str(LANG_MODEL_LOCAL_PATH), lm_alpha, lm_beta, beam\n        )\n    elif app_mode == with_video_page:\n        app_sst_with_video(\n            str(MODEL_LOCAL_PATH), str(LANG_MODEL_LOCAL_PATH), lm_alpha, lm_beta, beam\n        )\n\n\ndef app_sst(model_path: str, lm_path: str, lm_alpha: float, lm_beta: float, beam: int):\n    webrtc_ctx = webrtc_streamer(\n        key=\"speech-to-text\",\n        mode=WebRtcMode.SENDONLY,\n        audio_receiver_size=1024,\n        rtc_configuration={\"iceServers\": get_ice_servers()},\n        media_stream_constraints={\"video\": False, \"audio\": True},\n    )\n\n    status_indicator = st.empty()\n\n    if not webrtc_ctx.state.playing:\n        return\n\n    status_indicator.write(\"Loading...\")\n    text_output = st.empty()\n    stream = None\n\n    while True:\n        if webrtc_ctx.audio_receiver:\n            if stream is None:\n                from deepspeech import Model\n\n                model = Model(model_path)\n                model.enableExternalScorer(lm_path)\n                model.setScorerAlphaBeta(lm_alpha, lm_beta)\n                model.setBeamWidth(beam)\n\n                stream = model.createStream()\n\n                status_indicator.write(\"Model loaded.\")\n\n            sound_chunk = pydub.AudioSegment.empty()\n            try:\n                audio_frames = webrtc_ctx.audio_receiver.get_frames(timeout=1)\n            except queue.Empty:\n                time.sleep(0.1)\n                status_indicator.write(\"No frame arrived.\")\n                continue\n\n            status_indicator.write(\"Running. Say something!\")\n\n            for audio_frame in audio_frames:\n                sound = pydub.AudioSegment(\n                    data=audio_frame.to_ndarray().tobytes(),\n                    sample_width=audio_frame.format.bytes,\n                    frame_rate=audio_frame.sample_rate,\n                    channels=len(audio_frame.layout.channels),\n                )\n                sound_chunk += sound\n\n            if len(sound_chunk) > 0:\n                sound_chunk = sound_chunk.set_channels(1).set_frame_rate(\n                    model.sampleRate()\n                )\n                buffer = np.array(sound_chunk.get_array_of_samples())\n                stream.feedAudioContent(buffer)\n                text = stream.intermediateDecode()\n                text_output.markdown(f\"**Text:** {text}\")\n        else:\n            status_indicator.write(\"AudioReciver is not set. Abort.\")\n            break\n\n\ndef app_sst_with_video(\n    model_path: str, lm_path: str, lm_alpha: float, lm_beta: float, beam: int\n):\n    frames_deque_lock = threading.Lock()\n    frames_deque: deque = deque([])\n\n    async def queued_audio_frames_callback(\n        frames: List[av.AudioFrame],\n    ) -> av.AudioFrame:\n        with frames_deque_lock:\n            frames_deque.extend(frames)\n\n        # Return empty frames to be silent.\n        new_frames = []\n        for frame in frames:\n            input_array = frame.to_ndarray()\n            new_frame = av.AudioFrame.from_ndarray(\n                np.zeros(input_array.shape, dtype=input_array.dtype),\n                layout=frame.layout.name,\n            )\n            new_frame.sample_rate = frame.sample_rate\n            new_frames.append(new_frame)\n\n        return new_frames\n\n    webrtc_ctx = webrtc_streamer(\n        key=\"speech-to-text-w-video\",\n        mode=WebRtcMode.SENDRECV,\n        queued_audio_frames_callback=queued_audio_frames_callback,\n        rtc_configuration={\"iceServers\": get_ice_servers()},\n        media_stream_constraints={\"video\": True, \"audio\": True},\n    )\n\n    status_indicator = st.empty()\n\n    if not webrtc_ctx.state.playing:\n        return\n\n    status_indicator.write(\"Loading...\")\n    text_output = st.empty()\n    stream = None\n\n    while True:\n        if webrtc_ctx.state.playing:\n            if stream is None:\n                from deepspeech import Model\n\n                model = Model(model_path)\n                model.enableExternalScorer(lm_path)\n                model.setScorerAlphaBeta(lm_alpha, lm_beta)\n                model.setBeamWidth(beam)\n\n                stream = model.createStream()\n\n                status_indicator.write(\"Model loaded.\")\n\n            sound_chunk = pydub.AudioSegment.empty()\n\n            audio_frames = []\n            with frames_deque_lock:\n                while len(frames_deque) > 0:\n                    frame = frames_deque.popleft()\n                    audio_frames.append(frame)\n\n            if len(audio_frames) == 0:\n                time.sleep(0.1)\n                status_indicator.write(\"No frame arrived.\")\n                continue\n\n            status_indicator.write(\"Running. Say something!\")\n\n            for audio_frame in audio_frames:\n                sound = pydub.AudioSegment(\n                    data=audio_frame.to_ndarray().tobytes(),\n                    sample_width=audio_frame.format.bytes,\n                    frame_rate=audio_frame.sample_rate,\n                    channels=len(audio_frame.layout.channels),\n                )\n                sound_chunk += sound\n\n            if len(sound_chunk) > 0:\n                sound_chunk = sound_chunk.set_channels(1).set_frame_rate(\n                    model.sampleRate()\n                )\n                buffer = np.array(sound_chunk.get_array_of_samples())\n                stream.feedAudioContent(buffer)\n                text = stream.intermediateDecode()\n                text_output.markdown(f\"**Text:** {text}\")\n        else:\n            status_indicator.write(\"Stopped.\")\n            break\n\n\nif __name__ == \"__main__\":\n    import os\n\n    DEBUG = os.environ.get(\"DEBUG\", \"false\").lower() not in [\"false\", \"no\", \"0\"]\n\n    logging.basicConfig(\n        format=\"[%(asctime)s] %(levelname)7s from %(name)s in %(pathname)s:%(lineno)d: \"\n        \"%(message)s\",\n        force=True,\n    )\n\n    logger.setLevel(level=logging.DEBUG if DEBUG else logging.INFO)\n\n    st_webrtc_logger = logging.getLogger(\"streamlit_webrtc\")\n    st_webrtc_logger.setLevel(logging.DEBUG)\n\n    fsevents_logger = logging.getLogger(\"fsevents\")\n    fsevents_logger.setLevel(logging.WARNING)\n\n    main()\n"
  },
  {
    "path": "requirements.txt",
    "content": "av==9.2.0\ndeepspeech==0.9.3\nnumpy==1.23.0\npydub==0.25.1\nstreamlit==1.20.0\nstreamlit_webrtc==0.45.0\ntwilio~=8.1.0\n"
  }
]