Full Code of dusty-nv/jetson-voice for AI

master c6a8c9552c70 cached

115 files

749.0 KB

187.9k tokens

589 symbols

1 requests

Download .txt

Showing preview only (788K chars total). Download the full file or copy to clipboard to get everything.

Repository: dusty-nv/jetson-voice
Branch: master
Commit: c6a8c9552c70
Files: 115
Total size: 749.0 KB

Directory structure:
gitextract_8xzz9c2n/

├── .dockerignore
├── .gitignore
├── .gitmodules
├── Dockerfile.aarch64
├── Dockerfile.ros
├── Dockerfile.runtime
├── Dockerfile.x86_64
├── README.md
├── docker/
│   ├── build.sh
│   ├── push.sh
│   ├── run.sh
│   └── tag.sh
├── examples/
│   ├── asr.py
│   ├── assistant.py
│   ├── nlp.py
│   ├── nlp_qa.py
│   └── tts.py
├── jetson_voice/
│   ├── __init__.py
│   ├── asr.py
│   ├── auto.py
│   ├── backends/
│   │   ├── onnxruntime/
│   │   │   ├── __init__.py
│   │   │   └── ort_model.py
│   │   ├── riva/
│   │   │   ├── __init__.py
│   │   │   ├── riva_asr.py
│   │   │   └── riva_tts.py
│   │   └── tensorrt/
│   │       ├── __init__.py
│   │       ├── trt_binding.py
│   │       ├── trt_builder.py
│   │       └── trt_model.py
│   ├── models/
│   │   ├── __init__.py
│   │   ├── asr/
│   │   │   ├── __init__.py
│   │   │   ├── asr_engine.py
│   │   │   ├── ctc_beamsearch.py
│   │   │   ├── ctc_decoder.py
│   │   │   ├── ctc_greedy.py
│   │   │   └── ctc_utils.py
│   │   ├── nlp/
│   │   │   ├── __init__.py
│   │   │   ├── intent_slot.py
│   │   │   ├── nlp_utils.py
│   │   │   ├── question_answer.py
│   │   │   ├── text_classification.py
│   │   │   └── token_classification.py
│   │   └── tts/
│   │       ├── __init__.py
│   │       └── tts_engine.py
│   ├── nlp.py
│   ├── tts.py
│   └── utils/
│       ├── __init__.py
│       ├── audio.py
│       ├── config.py
│       ├── resource.py
│       └── softmax.py
├── patches/
│   ├── nemo/
│   │   ├── 1.0.0rc1/
│   │   │   ├── exportable.original.py
│   │   │   ├── exportable.py
│   │   │   ├── nlp/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── distilbert.diff
│   │   │   │   ├── distilbert.original.py
│   │   │   │   ├── distilbert.py
│   │   │   │   ├── huggingface_utils.py
│   │   │   │   ├── location.txt
│   │   │   │   └── mobilebert.py
│   │   │   ├── setup.original.py
│   │   │   └── setup.py
│   │   └── 1.6.2/
│   │       ├── requirements.original.txt
│   │       ├── requirements.txt
│   │       ├── requirements_nlp.original.txt
│   │       └── requirements_nlp.txt
│   ├── pytorch/
│   │   ├── 1.6.0/
│   │   │   ├── functional.diff
│   │   │   ├── functional.original.py
│   │   │   └── functional.py
│   │   └── 1.7.0/
│   │       ├── functional.diff
│   │       ├── functional.original.py
│   │       └── functional.py
│   └── transformers/
│       ├── 4.5.0/
│       │   ├── convert_graph_to_onnx.diff
│       │   ├── convert_graph_to_onnx.original.py
│       │   ├── convert_graph_to_onnx.py
│       │   └── modeling_distilbert.py
│       └── 4.5.1/
│           ├── convert_graph_to_onnx.diff
│           ├── convert_graph_to_onnx.original.py
│           ├── convert_graph_to_onnx.py
│           ├── modeling_distilbert.diff
│           ├── modeling_distilbert.original.py
│           └── modeling_distilbert.py
├── ros/
│   ├── CMakeLists.txt
│   ├── jetson_voice_ros/
│   │   ├── __init__.py
│   │   ├── asr.py
│   │   ├── audio_input.py
│   │   ├── audio_output.py
│   │   ├── nlp_intent_slot.py
│   │   ├── nlp_question_answer.py
│   │   └── tts.py
│   ├── launch/
│   │   ├── asr.launch.py
│   │   ├── audio_playback.launch.py
│   │   └── tts.launch.py
│   ├── msg/
│   │   ├── Audio.msg
│   │   ├── AudioInfo.msg
│   │   ├── IntentSlot.msg
│   │   ├── QuestionAnswerQuery.msg
│   │   ├── QuestionAnswerResult.msg
│   │   └── Slot.msg
│   └── package.xml
├── scripts/
│   ├── list_audio_devices.py
│   ├── list_models.py
│   ├── nemo_export_onnx.py
│   ├── nemo_list_models.py
│   ├── nemo_train_classifier.py
│   ├── nemo_train_intent.py
│   ├── nemo_train_ner.py
│   ├── nemo_train_qa.py
│   ├── os_version.sh
│   ├── record_mic.py
│   └── start_jupyter.sh
└── tests/
    ├── run_tests.py
    ├── test_asr.py
    ├── test_nlp.py
    └── test_tts.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .dockerignore
================================================
data/
.git
.cache

================================================
FILE: .gitignore
================================================
data/
logs/
packages/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/


================================================
FILE: .gitmodules
================================================
[submodule "docker/containers"]
	path = docker/containers
	url = https://github.com/dusty-nv/jetson-containers


================================================
FILE: Dockerfile.aarch64
================================================
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

ARG BASE_IMAGE
FROM ${BASE_IMAGE}

ENV DEBIAN_FRONTEND=noninteractive
ENV SHELL /bin/bash
ENV LANG='en_US.UTF-8' LANGUAGE='en_US:en' LC_ALL='en_US.UTF-8'
ARG MAKEFLAGS=-j$(nproc)
ARG WORKSPACE=/jetson-voice

WORKDIR ${WORKSPACE}

# alias python3 -> python
RUN rm /usr/bin/python && \
    ln -s /usr/bin/python3 /usr/bin/python && \
    ln -s /usr/bin/pip3 /usr/bin/pip


################################################################
## tokenizers/transformers
################################################################
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
            cmake \
		  curl \
		  pkg-config \
		  protobuf-compiler \
		  libprotoc-dev \
		  nano \
		  tzdata \
		  libssl-dev \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean
    
# install sentencepiece
RUN git clone https://github.com/google/sentencepiece && \
	cd sentencepiece && \
	mkdir build && \
	cd build && \
	cmake .. && \
	make -j $(nproc) && \
	make install && \
	ldconfig -v && \
	cd .. && \
	cd python && \
	python3 setup.py install --verbose && \
	cd ../../ && \
	rm -r -f sentencepiece

# install rust (used by tokenizers)
RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"
RUN rustc --version && \
    pip3 install setuptools-rust

# install tokenizers
RUN pip3 install tokenizers --verbose

# Apache arrow is needed by datasets package ('pip install pyarrow' is broken, so built from source)
#  https://github.com/apache/arrow/blob/master/docs/source/developers/python.rst#using-pip
#  https://raspberrypi.stackexchange.com/a/117723
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
			libjemalloc-dev \
			libboost-dev \
			libboost-filesystem-dev \
			libboost-system-dev \
			libboost-regex-dev \
			autoconf \
			flex \
			bison \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean 

RUN git clone --branch apache-arrow-3.0.0 https://github.com/apache/arrow.git && \
	cd arrow/cpp && \
	mkdir build && \
	cd build && \
	export ARROW_HOME=/usr/local && \
	cmake \
		-DCMAKE_INSTALL_PREFIX=$ARROW_HOME \
		-DCMAKE_INSTALL_LIBDIR=lib \
		-DARROW_WITH_BZ2=ON \
		-DARROW_WITH_ZLIB=ON \
		-DARROW_WITH_ZSTD=ON \
		-DARROW_WITH_LZ4=ON \
		-DARROW_WITH_SNAPPY=ON \
		-DARROW_PARQUET=ON \
		-DARROW_CUDA=ON \
		-DARROW_PYTHON=ON \
		-DARROW_BUILD_TESTS=OFF \
		.. && \
	make -j$(nproc) && \
	make install && \
	cd ../../python && \
	python3 setup.py build_ext --build-type=release --with-parquet --with-cuda --verbose && \
	python3 setup.py install --verbose && \
	cd ../../ && \
	rm -r -f arrow

RUN pip3 show pyarrow && \
	python3 -c "import pyarrow" && \
	python3 -c "from pyarrow import cuda"
	
# install huggingface (locked to 4.5.1, which the patches are based on)
# datasets package is needed to run the huggingface examples
RUN pip3 install transformers==4.5.1 datasets --verbose
  

################################################################
## onnx / onnxruntime / onnx-graphsurgeon
################################################################
ARG ONNXRUNTIME_URL=https://nvidia.box.com/shared/static/ukszbm1iklzymrt54mgxbzjfzunq7i9t.whl
ARG ONNXRUNTIME_WHL=onnxruntime_gpu-1.7.0-cp36-cp36m-linux_aarch64.whl

RUN wget --quiet --show-progress --progress=bar:force:noscroll --no-check-certificate ${ONNXRUNTIME_URL} -O ${ONNXRUNTIME_WHL} && \
    pip3 install ${ONNXRUNTIME_WHL} --verbose && \
    pip3 install onnx psutil sympy --verbose && \
    rm ${ONNXRUNTIME_WHL}

# install onnx-graphsurgeon
RUN cd /opt && \
    git clone --recursive https://github.com/nvidia/tensorrt tensorrt && \
    cd tensorrt/tools/onnx-graphsurgeon && \
    python3 setup.py install --verbose && \
    cd ../../../ && \
    rm -r -f tensorrt
    
    
################################################################
## NeMo
################################################################
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
		  libopencc-dev \
		  python3-tk \
		  libmecab-dev \
		  mecab \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean
    
RUN cd /opt && \
    git clone --recursive --branch v0.11.1 https://github.com/pytorch/text torchtext && \
    cd torchtext && \
    python3 setup.py clean install 
    
RUN pip3 show torch torchvision torchaudio torchtext

# clone/build nemo
ARG NEMO_VERSION
RUN cd /opt && git clone --recursive --branch v${NEMO_VERSION} https://github.com/nvidia/nemo

# needed for nemo 1.0
#COPY patches/nemo/${NEMO_VERSION}/setup.py /opt/nemo/setup.py

# needed for nemo 1.6
COPY patches/nemo/${NEMO_VERSION}/requirements.txt /opt/nemo/requirements/requirements.txt
COPY patches/nemo/${NEMO_VERSION}/requirements_nlp.txt /opt/nemo/requirements/requirements_nlp.txt

RUN pip3 install -r /opt/nemo/requirements/requirements.txt --verbose
RUN pip3 install -r /opt/nemo/requirements/requirements_asr.txt --verbose
RUN pip3 install -r /opt/nemo/requirements/requirements_nlp.txt --verbose
RUN pip3 install -r /opt/nemo/requirements/requirements_tts.txt --verbose
#RUN pip3 install omegaconf==2.1.0dev24 --verbose

RUN cd /opt/nemo && python3 setup.py install --verbose


################################################################
## ctc-decoders
################################################################
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
		  swig \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean
    
RUN git clone https://github.com/dusty-nv/OpenSeq2Seq -b ctc-decoders && \
    cd OpenSeq2Seq/decoders && \
    ./setup.sh
    
RUN pip3 install git+https://github.com/NVIDIA/dllogger
RUN pip3 install nltk


################################################################
## Riva GRPC
################################################################
ARG RIVA_URL=https://nvidia.box.com/shared/static/cu8z4t1n6shkxl6z5nh9hpkpn9yxomcz.whl
ARG RIVA_WHL=riva_api-1.0.0ea-py3-none-any.whl

RUN wget --quiet --show-progress --progress=bar:force:noscroll --no-check-certificate ${RIVA_URL} -O ${RIVA_WHL} && \
    pip3 install ${RIVA_WHL} --verbose && \
    rm ${RIVA_WHL}


################################################################
## install some audio stuff
################################################################
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
		  alsa-base \
            libasound2-dev \
            alsa-utils \
            portaudio19-dev \
		  libsndfile1 \
		  unzip \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean
    
RUN pip3 install soundfile pyaudio wave


################################################################
## various patches to install
################################################################
#COPY patches patches

#RUN PYTHON_ROOT=`pip3 show torch | grep Location: | cut -d' ' -f2` && \
#    PYTORCH_VERSION=`pip3 show torch | grep Version: | cut -d' ' -f2` && \
#    TRANSFORMERS_VERSION=`pip3 show transformers | grep Version: | cut -d' ' -f2` && \
#    NEMO_PATH="$PYTHON_ROOT/nemo_toolkit-${NEMO_VERSION}-py3.6.egg/nemo" && \
#    echo "Python package root path:  $PYTHON_ROOT" && \
#    echo "Applying patches for PyTorch $PYTORCH_VERSION" && \
#    echo "Applying patches for transformers $TRANSFORMERS_VERSION" && \
#    cp patches/pytorch/$PYTORCH_VERSION/functional.py $PYTHON_ROOT/torch/functional.py && \
#    cp patches/transformers/$TRANSFORMERS_VERSION/convert_graph_to_onnx.py $PYTHON_ROOT/transformers/convert_graph_to_onnx.py && \
#    cp patches/transformers/$TRANSFORMERS_VERSION/modeling_distilbert.py $PYTHON_ROOT/transformers/models/distilbert/modeling_distilbert.py && \
#    cp patches/nemo/${NEMO_VERSION}/nlp/distilbert.py $NEMO_PATH/collections/nlp/modules/common/huggingface/distilbert.py && \
#    cp patches/nemo/${NEMO_VERSION}/exportable.py $NEMO_PATH/core/classes/exportable.py


# set Python to unicode
ENV PYTHONIOENCODING=utf-8

# disable JupyterLab from auto-starting (inherited behavior from l4t-ml)
CMD /bin/bash


================================================
FILE: Dockerfile.ros
================================================
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

ARG BASE_IMAGE=jetson-voice:r32.5.0-foxy-base
FROM ${BASE_IMAGE}


################################################################
## install jetson_voice_ros package
################################################################
COPY ros /tmp/jetson_voice_ros
    
RUN source ${ROS_ROOT}/install/setup.bash && \
    mkdir -p ${ROS_ROOT}/src && \
    cd ${ROS_ROOT} && \
    cp -r /tmp/jetson_voice_ros src && \
    
    # build the package
    colcon build \
        --merge-install \
	    --base-paths src/jetson_voice_ros \
        --event-handlers console_direct+ && \
	  
    # clean-up build files
    rm -rf ${ROS_ROOT}/src && \
    rm -rf ${ROS_ROOT}/logs && \
    rm -rf ${ROS_ROOT}/build


################################################################
## project install
################################################################
ARG WORKSPACE=/jetson-voice

COPY jetson_voice ${WORKSPACE}/jetson_voice
COPY examples ${WORKSPACE}/examples
COPY scripts ${WORKSPACE}/scripts
COPY tests ${WORKSPACE}/tests

ENV PYTHONPATH="${WORKSPACE}:${PYTHONPATH}"


================================================
FILE: Dockerfile.runtime
================================================
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

ARG BASE_IMAGE=jetson-voice:r32.5.0-base
FROM ${BASE_IMAGE}

ARG WORKSPACE=/jetson-voice
WORKDIR ${WORKSPACE}


################################################################
## project install
################################################################
COPY jetson_voice ${WORKSPACE}/jetson_voice
COPY examples ${WORKSPACE}/examples
COPY scripts ${WORKSPACE}/scripts
COPY tests ${WORKSPACE}/tests

ENV PYTHONPATH="${WORKSPACE}:${PYTHONPATH}"

================================================
FILE: Dockerfile.x86_64
================================================
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

ARG BASE_IMAGE
FROM ${BASE_IMAGE}

ENV DEBIAN_FRONTEND=noninteractive
ENV SHELL /bin/bash
ARG MAKEFLAGS=-j$(nproc)
ARG WORKSPACE=/jetson-voice

WORKDIR ${WORKSPACE}


################################################################
## PyCUDA
################################################################
RUN pip3 install pycuda six --verbose


################################################################
## ctc-decoders
################################################################
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
		  swig \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean
    
RUN git clone https://github.com/dusty-nv/OpenSeq2Seq -b ctc-decoders && \
    cd OpenSeq2Seq/decoders && \
    ./setup.sh
    
RUN pip3 install git+https://github.com/NVIDIA/dllogger
RUN pip3 install nltk


################################################################
## Jarvis GRPC
################################################################
ARG JARVIS_URL=https://nvidia.box.com/shared/static/on9t7zqes2s6er1wpumidnc6rphwsyy7.whl
ARG JARVIS_WHL=jarvis_api-1.0.0b1-py3-none-any.whl

RUN wget --quiet --show-progress --progress=bar:force:noscroll --no-check-certificate ${JARVIS_URL} -O ${JARVIS_WHL} && \
    pip3 install ${JARVIS_WHL} --verbose && \
    rm ${JARVIS_WHL}
    
    
################################################################
## install some audio stuff
################################################################
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
		  alsa-base \
            libasound2-dev \
            alsa-utils \
            portaudio19-dev \
		  libsndfile1 \
		  unzip \
		  tzdata \
		  nano \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean
    
RUN pip3 install soundfile pyaudio wave


################################################################
## various patches to install
################################################################
COPY patches patches

ARG NEMO_VERSION
RUN PYTHON_ROOT=`pip3 show transformers | grep Location: | cut -d' ' -f2` && \
    TRANSFORMERS_VERSION=`pip3 show transformers | grep Version: | cut -d' ' -f2` && \
    echo "Python package root path:  $PYTHON_ROOT" && \
    echo "Applying patches for transformers $TRANSFORMERS_VERSION" && \
    cp patches/transformers/$TRANSFORMERS_VERSION/convert_graph_to_onnx.py $PYTHON_ROOT/transformers/convert_graph_to_onnx.py && \
    cp patches/transformers/$TRANSFORMERS_VERSION/modeling_distilbert.py $PYTHON_ROOT/transformers/models/distilbert/modeling_distilbert.py && \
    cp patches/nemo/${NEMO_VERSION}/nlp/distilbert.py $PYTHON_ROOT/nemo/collections/nlp/modules/common/huggingface/distilbert.py && \
    cp patches/nemo/${NEMO_VERSION}/exportable.py $PYTHON_ROOT/nemo/core/classes/exportable.py


# set Python to unicode
ENV PYTHONIOENCODING=utf-8
    

================================================
FILE: README.md
================================================
# jetson-voice

jetson-voice is an ASR/NLP/TTS deep learning inference library for Jetson Nano, TX1/TX2, Xavier NX, and AGX Xavier.  It supports Python and JetPack 4.4.1 or newer.  The DNN models were trained with [NeMo](https://github.com/NVIDIA/NeMo) and deployed with [TensorRT](https://developer.nvidia.com/tensorrt) for optimized performance.  All computation is performed using the onboard GPU.

Currently the following capabilities are included:

* [Automatic Speech Recognition (ASR)](#automatic-speech-recognition-asr)
	* [Streaming ASR (QuartzNet)](#automatic-speech-recognition-asr) 
	* [Command/Keyword Recognition (MatchboxNet)](#commandkeyword-recognition)
	* [Voice Activity Detection (VAD Marblenet)](#voice-activity-detection-vad)
* [Natural Language Processing (NLP)](#natural-language-processing-nlp)
	* [Joint Intent/Slot Classification](#joint-intentslot-classification)
	* [Text Classification (Sentiment Analysis)](#text-classification)
	* [Token Classification (Named Entity Recognition)](#token-classification)
	* [Question/Answering (QA)](#questionanswering)
* [Text-to-Speech (TTS)](#text-to-speech-tts)
	
The NLP models are using the [DistilBERT](https://arxiv.org/abs/1910.01108) transformer architecture for reduced memory usage and increased performance.  For samples of the text-to-speech output, see the [TTS Audio Samples](#tts-audio-samples) section below.

## Running the Container

jetson-voice is distributed as a Docker container due to the number of dependencies.  There are pre-built containers images available on DockerHub for JetPack 4.4.1 and newer:

```
dustynv/jetson-voice:r32.4.4    # JetPack 4.4.1 (L4T R32.4.4)
dustynv/jetson-voice:r32.5.0    # JetPack 4.5 (L4T R32.5.0) / JetPack 4.5.1 (L4T R32.5.1)
dustynv/jetson-voice:r32.6.1    # JetPack 4.6 (L4T R32.6.1)
dustynv/jetson-voice:r32.7.1    # JetPack 4.6.1 (L4T R32.7.1)
```

To download and run the container, you can simply clone this repo and use the `docker/run.sh` script:

``` bash
$ git clone --branch dev https://github.com/dusty-nv/jetson-voice
$ cd jetson-voice
$ docker/run.sh
```

> **note**:  if you want to use a USB microphone or speaker, plug it in *before* you start the container

There are some optional arguments to `docker/run.sh` that you can use:

* `-r` (`--run`) specifies a run command, otherwise the container will start in an interactive shell.
* `-v` (`--volume`) mount a directory from the host into the container (`/host/path:/container/path`)
* `--dev` starts the container in development mode, where all the source files are mounted for easy editing

The run script will automatically mount the `data/` directory into the container, which stores the models and other data files.  If you save files from the container there, they will also show up under `data/` on the host.

## Automatic Speech Recognition (ASR)

The speech recognition in jetson-voice is a streaming service, so it's intended to be used on live sources and transcribes the audio in 1-second chunks.  It uses a [QuartzNet-15x5](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#quartznet) model followed by a CTC beamsearch decoder and language model, to further refine the raw output of the network.  It detects breaks in the audio to determine the end of sentences.  For information about using the ASR APIs, please refer to [`jetson_voice/asr.py`](jetson_voice/asr.py) and see [`examples/asr.py`](examples/asr.py)

After you start the container, first run a test audio file (wav/ogg/flac) through [`examples/asr.py`](examples/asr.py) to verify that the system is functional.  Run this command (and all subsequent commands) inside the container:

``` bash
$ examples/asr.py --wav data/audio/dusty.wav

hi
hi hi this is dust
hi hi this is dusty check
hi hi this is dusty check one two
hi hi this is dusty check one two three
hi hi this is dusty check one two three.

what's the weather or
what's the weather going to be tomorrow
what's the weather going to be tomorrow in pittsburgh
what's the weather going to be tomorrow in pittsburgh.

today is
today is wednesday
today is wednesday tomorrow is thursday
today is wednesday tomorrow is thursday.

i would like
i would like to order a large
i would like to order a large pepperoni pizza
i would like to order a large pepperoni pizza.

is it going to be
is it going to be cloudy tomorrow.
```

> The first time you run each model, TensorRT will take a few minutes to optimize it.  
> This optimized model is then cached to disk, so the next time you run the model it will load faster.

#### Live Microphone

To test the ASR on a mic, first list the audio devices in your system to get the audio device ID's:

``` bash
$ scripts/list_audio_devices.sh

----------------------------------------------------
 Audio Input Devices
----------------------------------------------------
Input Device ID 1 - 'tegra-snd-t210ref-mobile-rt565x: - (hw:1,0)' (inputs=16) (sample_rate=44100)
Input Device ID 2 - 'tegra-snd-t210ref-mobile-rt565x: - (hw:1,1)' (inputs=16) (sample_rate=44100)
Input Device ID 3 - 'tegra-snd-t210ref-mobile-rt565x: - (hw:1,2)' (inputs=16) (sample_rate=44100)
Input Device ID 4 - 'tegra-snd-t210ref-mobile-rt565x: - (hw:1,3)' (inputs=16) (sample_rate=44100)
Input Device ID 5 - 'tegra-snd-t210ref-mobile-rt565x: - (hw:1,4)' (inputs=16) (sample_rate=44100)
Input Device ID 6 - 'tegra-snd-t210ref-mobile-rt565x: - (hw:1,5)' (inputs=16) (sample_rate=44100)
Input Device ID 7 - 'tegra-snd-t210ref-mobile-rt565x: - (hw:1,6)' (inputs=16) (sample_rate=44100)
Input Device ID 8 - 'tegra-snd-t210ref-mobile-rt565x: - (hw:1,7)' (inputs=16) (sample_rate=44100)
Input Device ID 9 - 'tegra-snd-t210ref-mobile-rt565x: - (hw:1,8)' (inputs=16) (sample_rate=44100)
Input Device ID 10 - 'tegra-snd-t210ref-mobile-rt565x: - (hw:1,9)' (inputs=16) (sample_rate=44100)
Input Device ID 11 - 'Logitech H570e Mono: USB Audio (hw:2,0)' (inputs=2) (sample_rate=44100)
Input Device ID 12 - 'Samson Meteor Mic: USB Audio (hw:3,0)' (inputs=2) (sample_rate=44100)
```

> If you don't see your audio device listed, exit and restart the container.  
> USB devices should be attached *before* the container is started.

Then run the ASR example with the `--mic <DEVICE>` option, and specify either the device ID or name:

``` bash
$ examples/asr.py --mic 11

hey
hey how are you guys
hey how are you guys.

# (Press Ctrl+C to exit)
```

## ASR Classification

There are other ASR models included for command/keyword recognition ([MatchboxNet](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/speech_classification/models.html#matchboxnet-speech-commands)) and voice activity detection ([VAD MarbleNet](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/speech_classification/models.html#marblenet-vad)).  These models are smaller and faster, and classify chunks of audio as opposed to transcribing text.  

### Command/Keyword Recognition

The [MatchboxNet](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/speech_classification/models.html#matchboxnet-speech-commands) model was trained on 12 keywords from the [Google Speech Commands](https://ai.googleblog.com/2017/08/launching-speech-commands-dataset.html) dataset:

```
# MatchboxNet classes
"yes",
"no",
"up",
"down",
"left",
"right",
"on",
"off",
"stop",
"go",
"unknown",
"silence"
```

You can run it through the same ASR example as above by specifying the `--model matchboxnet` argument:

``` bash
$ examples/asr.py --model matchboxnet --wav data/audio/commands.wav

class 'unknown' (0.384)
class 'yes' (1.000)
class 'no' (1.000)
class 'up' (1.000)
class 'down' (1.000)
class 'left' (1.000)
class 'left' (1.000)
class 'right' (1.000)
class 'on' (1.000)
class 'off' (1.000)
class 'stop' (1.000)
class 'go' (1.000)
class 'go' (1.000)
class 'silence' (0.639)
class 'silence' (0.576)
```

The numbers printed on the right are the classification probabilities between 0 and 1.

### Voice Activity Detection (VAD)

The voice activity model ([VAD MarbleNet](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/speech_classification/models.html#marblenet-vad)) is a binary model that outputs `background` or `speech`:

``` bash
$ examples/asr.py --model vad_marblenet --wav data/audio/commands.wav

class 'background' (0.969)
class 'background' (0.984)
class 'background' (0.987)
class 'speech' (0.997)
class 'speech' (1.000)
class 'speech' (1.000)
class 'speech' (0.998)
class 'background' (0.987)
class 'speech' (1.000)
class 'speech' (1.000)
class 'speech' (1.000)
class 'background' (0.988)
class 'background' (0.784)
```

## Natural Language Processing (NLP)

There are two samples included for NLP:

* [`examples/nlp.py`](examples/nlp.py) (intent/slot, text classification, token classification)
* [`examples/nlp_qa.py`](examples/nlp_qa.py) (question/answering)

These each use a [DistilBERT](https://arxiv.org/abs/1910.01108) model which has been fined-tuned for it's particular task.  For information about using the NLP APIs, please refer to [`jetson_voice/nlp.py`](jetson_voice/nlp.py) and see the samples above.

### Joint Intent/Slot Classification

Joint Intent and Slot classification is a task of classifying an Intent and detecting all relevant Slots (Entities) for this Intent in a query. For example, in the query: `What is the weather in Santa Clara tomorrow morning?`, we would like to classify the query as a `weather` Intent, and detect `Santa Clara` as a location slot and `tomorrow morning` as a date_time slot. 

Intents and Slots names are usually task specific and defined as labels in the training data.  The included intent/slot model was trained on the [NLU-Evaluation-Data](https://github.com/xliuhw/NLU-Evaluation-Data) dataset - you can find the various intent and slot classes that it supports [here](https://gist.github.com/dusty-nv/119474dfcf3bfccfbb8428951a64cd23).  They are common things that you might ask a virtual assistant:

```
$ examples/nlp.py --model distilbert_intent

Enter intent_slot query, or Q to quit:

> What is the weather in Santa Clara tomorrow morning?

{'intent': 'weather_query',
 'score': 0.7165476,
 'slots': [{'score': 0.6280392, 'slot': 'place_name', 'text': 'Santa'},
           {'score': 0.61760694, 'slot': 'place_name', 'text': 'Clara'},
           {'score': 0.5439486, 'slot': 'date', 'text': 'tomorrow'},
           {'score': 0.4520608, 'slot': 'date', 'text': 'morning'}]}

> Set an alarm for 730am

{'intent': 'alarm_set',
 'score': 0.5713072,
 'slots': [{'score': 0.40017933, 'slot': 'time', 'text': '730am'}]}

> Turn up the volume

{'intent': 'audio_volume_up', 'score': 0.33523008, 'slots': []}

> What is my schedule for tomorrow?

{'intent': 'calendar_query',
 'score': 0.37434494,
 'slots': [{'score': 0.5732627, 'slot': 'date', 'text': 'tomorrow'}]}

> Order a pepperoni pizza from domino's

{'intent': 'takeaway_order',
 'score': 0.50629586,
 'slots': [{'score': 0.27558547, 'slot': 'food_type', 'text': 'pepperoni'},
           {'score': 0.2778827, 'slot': 'food_type', 'text': 'pizza'},
           {'score': 0.21785143, 'slot': 'business_name', 'text': 'dominos'}]}
	
> Where's the closest Starbucks?

{'intent': 'recommendation_locations',
 'score': 0.5438984,
 'slots': [{'score': 0.1604197, 'slot': 'place_name', 'text': 'Starbucks'}]}

```

### Text Classification

In this text classification example, we'll use the included sentiment analysis model that was trained on the [Standford Sentiment Treebank (SST-2)](https://nlp.stanford.edu/sentiment/index.html) dataset.  It will label queries as either positive or negative, along with their probability:

```
$ examples/nlp.py --model distilbert_sentiment

Enter text_classification query, or Q to quit:

> today was warm, sunny and beautiful out

{'class': 1, 'label': '1', 'score': 0.9985898}

> today was cold and rainy and not very nice

{'class': 0, 'label': '0', 'score': 0.99136007}
```

(class 0 is negative sentiment and class 1 is positive sentiment)

### Token Classification

Whereas text classification classifies entire queries, token classification classifies individual tokens (or words).  In this example, we'll be performing Named Entity Recognition (NER), which is the task of detecting and classifying key information (entities) in text. For example, in a sentence: `Mary lives in Santa Clara and works at NVIDIA`, we should detect that `Mary` is a person, `Santa Clara` is a location and `NVIDIA` is a company.

The included token classification model for NER was trained on the [Groningen Meaning Bank (GMB)](http://www.let.rug.nl/bjerva/gmb/about.php) and supports the following annotations in [IOB format](https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)) (short for inside, outside, beginning)

* LOC = Geographical Entity
* ORG = Organization
* PER = Person
* GPE = Geopolitical Entity
* TIME = Time indicator
* MISC = Artifact, Event, or Natural Phenomenon

``` bash
$ examples/nlp.py --model distilbert_ner

Enter token_classification query, or Q to quit:
> Mary lives in Santa Clara and works at NVIDIA

Mary[B-PER 0.989] lives in Santa[B-LOC 0.998] Clara[I-LOC 0.996] and works at NVIDIA[B-ORG 0.967]

> Lisa's favorite place to climb in the summer is El Capitan in Yosemite National Park in California, U.S.

Lisa's[B-PER 0.995] favorite place to climb in the summer[B-TIME 0.996] is El[B-PER 0.577] Capitan[I-PER 0.483] 
in Yosemite[B-LOC 0.987] National[I-LOC 0.988] Park[I-LOC 0.98] in California[B-LOC 0.998], U.S[B-LOC 0.997].
```

### Question/Answering

Question/Answering (QA) works by supplying a context paragraph which the model then queries the best answer from.  The [`nlp_qa.py`](examples/nlp_qa.py) example allows you to select from several built-in context paragraphs (or supply your own) and to ask questions about these topics.  

The QA model is flexible and doesn't need re-trained on different topics, as it was trained on the [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) question/answering dataset which allows it to extract answers from a variety of contexts.  It essentially learns to identify the information most relevant to your query from the context passage, as opposed to learning the content itself.

``` bash
$ examples/nlp_qa.py 

Context:
The Amazon rainforest is a moist broadleaf forest that covers most of the Amazon basin of South America. 
This basin encompasses 7,000,000 square kilometres (2,700,000 sq mi), of which 5,500,000 square kilometres 
(2,100,000 sq mi) are covered by the rainforest. The majority of the forest is contained within Brazil, 
with 60% of the rainforest, followed by Peru with 13%, and Colombia with 10%.

Enter a question, C to change context, P to print context, or Q to quit:

> How big is the Amazon?

Answer: 7,000,000 square kilometres
Score:  0.24993503093719482

> which country has the most?

Answer: Brazil
Score:  0.5964332222938538
```

To change the topic or create one of your own, enter `C`:

```
Enter a question, C to change context, P to print context, or Q to quit:
> C

Select from one of the following topics, or enter your own context paragraph:
   1. Amazon
   2. Geology
   3. Moon Landing
   4. Pi
   5. Super Bowl 55
> 3

Context:
The first manned Moon landing was Apollo 11 on July, 20 1969. The first human to step on the Moon was 
astronaut Neil Armstrong followed second by Buzz Aldrin. They landed in the Sea of Tranquility with their 
lunar module the Eagle. They were on the lunar surface for 2.25 hours and collected 50 pounds of moon rocks.

Enter a question, C to change context, P to print context, or Q to quit:

> Who was the first man on the moon?

Answer: Neil Armstrong
Score:  0.39105066657066345
```

## Text-to-Speech (TTS)

The text-to-speech service uses an ensemble of two models:  FastPitch to generate MEL spectrograms from text, and HiFiGAN as the vocoder (female English voice).  For information about using the TTS APIs, please refer to [`jetson_voice/tts.py`](jetson_voice/tts.py) and see [`examples/tts.py`](examples/tts.py)

The [`examples/tts.py`](examples/tts.py) app can output the audio to a speaker, wav file, or sequence of wav files.  Run it with `--list-devices` to get a list of your audio devices.

``` bash
$ examples/tts.py --output-device 11 --output-wav data/audio/tts_test

> The weather tomorrow is forecast to be warm and sunny with a high of 83 degrees.

Run 0 -- Time to first audio: 1.820s. Generated 5.36s of audio. RTFx=2.95.
Run 1 -- Time to first audio: 0.232s. Generated 5.36s of audio. RTFx=23.15.
Run 2 -- Time to first audio: 0.230s. Generated 5.36s of audio. RTFx=23.31.
Run 3 -- Time to first audio: 0.231s. Generated 5.36s of audio. RTFx=23.25.
Run 4 -- Time to first audio: 0.230s. Generated 5.36s of audio. RTFx=23.36.
Run 5 -- Time to first audio: 0.230s. Generated 5.36s of audio. RTFx=23.35.

Wrote audio to data/audio/tts_test/0.wav

Enter text, or Q to quit:
> Sally sells seashells by the seashore.

Run 0 -- Time to first audio: 0.316s. Generated 2.73s of audio. RTFx=8.63.
Run 1 -- Time to first audio: 0.126s. Generated 2.73s of audio. RTFx=21.61.
Run 2 -- Time to first audio: 0.127s. Generated 2.73s of audio. RTFx=21.51.
Run 3 -- Time to first audio: 0.126s. Generated 2.73s of audio. RTFx=21.68.
Run 4 -- Time to first audio: 0.126s. Generated 2.73s of audio. RTFx=21.68.
Run 5 -- Time to first audio: 0.126s. Generated 2.73s of audio. RTFx=21.61.

Wrote audio to data/audio/tts_test/1.wav
```

#### TTS Audio Samples

* [Weather forecast](data/audio/tts_examples/0.wav) (wav)
* [Sally sells seashells](data/audio/tts_examples/1.wav) (wav)


## Tests

There is an automated test suite included that will verify all of the models are working properly.  You can run it with the `tests/run_tests.py` script:

``` bash
$ tests/run_tests.py

----------------------------------------------------
 TEST SUMMARY
----------------------------------------------------
test_asr.py (quartznet)                  PASSED
test_asr.py (quartznet_greedy)           PASSED
test_asr.py (matchboxnet)                PASSED
test_asr.py (vad_marblenet)              PASSED
test_nlp.py (distilbert_qa_128)          PASSED
test_nlp.py (distilbert_qa_384)          PASSED
test_nlp.py (distilbert_intent)          PASSED
test_nlp.py (distilbert_sentiment)       PASSED
test_nlp.py (distilbert_ner)             PASSED
test_tts.py (fastpitch_hifigan)          PASSED

passed 10 of 10 tests
saved logs to data/tests/logs/20210610_1512
```

The logs of the individual tests are printed to the screen and saved to a timestamped directory.





================================================
FILE: docker/build.sh
================================================
#!/usr/bin/env bash

ROS_DISTRO=${1:-"none"}
BASE_IMAGE=$2
NEMO_VERSION="1.0.0rc1"

# find container tag from os version
source docker/tag.sh

if [ $ARCH = "aarch64" ]; then
	if [ -z $BASE_IMAGE ]; then
		if [ $L4T_VERSION = "32.7.1" ]; then
			BASE_IMAGE="l4t-ml:r32.7.1-py3"
			#BASE_IMAGE="nvcr.io/nvidia/l4t-ml:r32.7.1-py3"
			NEMO_VERSION="1.6.2"
		elif [ $L4T_VERSION = "32.6.1" ]; then
			BASE_IMAGE="nvcr.io/nvidia/l4t-ml:r32.6.1-py3"
		elif [ $L4T_VERSION = "32.5.0" ] || [ $L4T_VERSION = "32.5.1" ]; then
			BASE_IMAGE="nvcr.io/nvidia/l4t-ml:r32.5.0-py3"
		elif [ $L4T_VERSION = "32.4.4" ]; then
			BASE_IMAGE="nvcr.io/nvidia/l4t-ml:r32.4.4-py3"
		elif [ $L4T_VERSION = "32.4.3" ]; then
			BASE_IMAGE="nvcr.io/nvidia/l4t-ml:r32.4.3-py3"
		elif [ $L4T_VERSION = "32.4.2" ]; then
			BASE_IMAGE="nvcr.io/nvidia/l4t-ml:r32.4.2-py3"
		else
			echo "cannot build jetson-voice docker container for L4T R$L4T_VERSION"
			echo "please upgrade to the latest JetPack, or build jetson-voice natively"
			exit 1
		fi
	fi
elif [ $ARCH = "x86_64" ]; then
	BASE_IMAGE=${BASE_IMAGE:-"nvcr.io/nvidia/nemo:$NEMO_VERSION"}
fi

VOICE_CONTAINER="$CONTAINER_NAME:$TAG"
VOICE_CONTAINER_BASE="$VOICE_CONTAINER-base"

# build the base container
echo "CONTAINER=$VOICE_CONTAINER_BASE"
echo "BASE_IMAGE=$BASE_IMAGE"

sudo docker build -t $VOICE_CONTAINER_BASE -f Dockerfile.$ARCH \
          --build-arg BASE_IMAGE=$BASE_IMAGE \
		--build-arg NEMO_VERSION=$NEMO_VERSION \
		.
		
# build the runtime container
echo "CONTAINER=$VOICE_CONTAINER"
echo "BASE_IMAGE=$VOICE_CONTAINER_BASE"

sudo docker build -t $VOICE_CONTAINER -f Dockerfile.runtime \
          --build-arg BASE_IMAGE=$VOICE_CONTAINER_BASE \
		.

# build ROS version of container
if [[ "$ROS_DISTRO" != "none" ]] && [[ $ARCH = "aarch64" ]]; then
	ROS_CONTAINER="$VOICE_CONTAINER-ros-$ROS_DISTRO"
	ROS_CONTAINER_BASE="$ROS_CONTAINER-base"
	
	# copy files needed to build ROS container
	if [ ! -d "packages/" ]; then
		cp -r docker/containers/packages packages
	fi
	
	# opencv.csv mounts files that preclude us installing different version of opencv
	# temporarily disable the opencv.csv mounts while we build the container
	CV_CSV="/etc/nvidia-container-runtime/host-files-for-container.d/opencv.csv"

	if [ -f "$CV_CSV" ]; then
		sudo mv $CV_CSV $CV_CSV.backup
	fi
	
	# build ROS on top of jetson-voice 
	echo "CONTAINER=$ROS_CONTAINER_BASE"
	echo "BASE_IMAGE=$VOICE_CONTAINER_BASE"

	sudo docker build -t $ROS_CONTAINER_BASE -f docker/containers/Dockerfile.ros.$ROS_DISTRO \
          --build-arg BASE_IMAGE=$VOICE_CONTAINER_BASE \
		.
	
	# install jetson_voice_ros package
	echo "CONTAINER=$ROS_CONTAINER"
	echo "BASE_IMAGE=$ROS_CONTAINER_BASE"

	sudo docker build -t $ROS_CONTAINER -f Dockerfile.ros \
          --build-arg BASE_IMAGE=$ROS_CONTAINER_BASE \
		.
		
	# restore opencv.csv mounts
	if [ -f "$CV_CSV.backup" ]; then
		sudo mv $CV_CSV.backup $CV_CSV
	fi
fi

================================================
FILE: docker/push.sh
================================================
#!/usr/bin/env bash

ROS_DISTRO=${1:-"foxy"}
source docker/tag.sh

# push image
push() 
{
	local remote_image="dustynv/$1"
	
	sudo docker rmi $remote_image
	sudo docker tag $1 $remote_image
	
	echo "pushing image $remote_image"
	sudo docker push $remote_image
	echo "done pushing image $remote_image"
}

push "$CONTAINER_NAME:$TAG"

ROS_CONTAINER="$CONTAINER_NAME:$TAG-ros-$ROS_DISTRO"
push "$ROS_CONTAINER"

================================================
FILE: docker/run.sh
================================================
#!/usr/bin/env bash
#
# Start an instance of the jetson-voice docker container.
# See below or run this script with -h or --help to see usage options.
#
# This script should be run from the root dir of the jetson-voice project:
#
#     $ cd /path/to/your/jetson-voice
#     $ docker/run.sh
#

show_help() {
    echo " "
    echo "usage: Starts the Docker container and runs a user-specified command"
    echo " "
    echo "   ./docker/run.sh --container DOCKER_IMAGE"
    echo "                   --volume HOST_DIR:MOUNT_DIR"
    echo "                   --run RUN_COMMAND"
    echo " "
    echo "args:"
    echo " "
    echo "   --help                       Show this help text and quit"
    echo " "
    echo "   -c, --container DOCKER_IMAGE Specifies the name of the Docker container"
    echo "                                image to use (default: 'jetson-voice')"
    echo " "
    echo "   --ros ROS_DISTRO Starts the version of the container using the"
    echo "                    specified ROS distro (or foxy if not specified)"
    echo "                    This is overridden by the --container argument"
    echo " "
    echo "   -d, --dev  Runs the container in development mode, where the source"
    echo "              files are mounted into the container dynamically, so they"
    echo "              can more easily be edited from the host machine."
    echo " "
    echo "   -v, --volume HOST_DIR:MOUNT_DIR Mount a path from the host system into"
    echo "                                   the container.  Should be specified as:"
    echo " "
    echo "                                      -v /my/host/path:/my/container/path"
    echo " "
    echo "                                   (these should be absolute paths)"
    echo " "
    echo "   -r, --run RUN_COMMAND  Command to run once the container is started."
    echo "                          Note that this argument must be invoked last,"
    echo "                          as all further arguments will form the command."
    echo "                          If no run command is specified, an interactive"
    echo "                          terminal into the container will be provided."
    echo " "
}

die() {
    printf '%s\n' "$1"
    show_help
    exit 1
}

# find container tag from os version
source docker/tag.sh

# where the project resides inside docker
DOCKER_ROOT="/jetson-voice"	

# generate mount commands
DATA_VOLUME="--volume $PWD/data:$DOCKER_ROOT/data"
DEV_VOLUME=""

# parse user arguments
USER_VOLUME=""
USER_COMMAND=""

while :; do
    case $1 in
        -h|-\?|--help)
            show_help    # Display a usage synopsis.
            exit
            ;;
        -c|--container)       # Takes an option argument; ensure it has been specified.
            if [ "$2" ]; then
                CONTAINER_IMAGE=$2
                shift
            else
                die 'ERROR: "--container" requires a non-empty option argument.'
            fi
            ;;
        --container=?*)
            CONTAINER_IMAGE=${1#*=} # Delete everything up to "=" and assign the remainder.
            ;;
        --container=)         # Handle the case of an empty --image=
            die 'ERROR: "--container" requires a non-empty option argument.'
            ;;
	   --ros)
            if [ "$2" ]; then
                ROS_DISTRO=$2
                shift
            else
                ROS_DISTRO="foxy"
            fi
            ;;
        --ros=?*)
            ROS_DISTRO=${1#*=} # Delete everything up to "=" and assign the remainder.
            ;;
        --ros=)         # Handle the case of an empty --image=
            ROS_DISTRO="foxy"
            ;;
	   -d|--dev)
            DEV_VOLUME="--volume $PWD/jetson_voice:$DOCKER_ROOT/jetson_voice --volume $PWD/examples:$DOCKER_ROOT/examples --volume $PWD/scripts:$DOCKER_ROOT/scripts --volume $PWD/tests:$DOCKER_ROOT/tests"
            ;;
        -v|--volume)
            if [ "$2" ]; then
                USER_VOLUME=" -v $2 "
                shift
            else
                die 'ERROR: "--volume" requires a non-empty option argument.'
            fi
            ;;
        --volume=?*)
            USER_VOLUME=" -v ${1#*=} " # Delete everything up to "=" and assign the remainder.
            ;;
        --volume=)         # Handle the case of an empty --image=
            die 'ERROR: "--volume" requires a non-empty option argument.'
            ;;
        -r|--run)
            if [ "$2" ]; then
                shift
                USER_COMMAND=" $@ "
            else
                die 'ERROR: "--run" requires a non-empty option argument.'
            fi
            ;;
        --)              # End of all options.
            shift
            break
            ;;
        -?*)
            printf 'WARN: Unknown option (ignored): %s\n' "$1" >&2
            ;;
        *)               # Default case: No more options, so break out of the loop.
            break
    esac

    shift
done

# select the container, unless --container was explicitly specified
if [ -z "$CONTAINER_IMAGE" ]; then
	CONTAINER_IMAGE="$CONTAINER_NAME:$TAG"

	if [ -n "$ROS_DISTRO" ]; then
		CONTAINER_IMAGE="$CONTAINER_NAME:$TAG-ros-$ROS_DISTRO"
	fi

	CONTAINER_REMOTE_IMAGE="dustynv/$CONTAINER_IMAGE"

	# check for local image
	if [[ "$(sudo docker images -q $CONTAINER_IMAGE 2> /dev/null)" == "" ]]; then
		CONTAINER_IMAGE=$CONTAINER_REMOTE_IMAGE
	fi
fi

echo "CONTAINER:     $CONTAINER_IMAGE"
echo "DEV_VOLUME:    $DEV_VOLUME"
echo "DATA_VOLUME:   $DATA_VOLUME"
echo "USER_VOLUME:   $USER_VOLUME"
echo "USER_COMMAND:  $USER_COMMAND"

MOUNTS="\
--device /dev/snd \
--device /dev/bus/usb \
--volume /etc/timezone:/etc/timezone:ro \
--volume /etc/localtime:/etc/localtime:ro \
$DEV_VOLUME \
$DATA_VOLUME \
$USER_VOLUME"

if [ $ARCH = "aarch64" ]; then

	sudo docker run --runtime nvidia -it --rm \
		--name=$CONTAINER_NAME \
		--network host \
		$MOUNTS $CONTAINER_IMAGE $USER_COMMAND
	    
elif [ $ARCH = "x86_64" ]; then

	sudo docker run --gpus all -it --rm \
		--name=$CONTAINER_NAME \
		--network=host \
		--shm-size=8g \
		--ulimit memlock=-1 \
		--ulimit stack=67108864 \
		$MOUNTS $CONTAINER_IMAGE $USER_COMMAND
		
fi


================================================
FILE: docker/tag.sh
================================================
#!/usr/bin/env bash

# find OS version
source scripts/os_version.sh

if [ $ARCH = "aarch64" ]; then
	TAG="r$L4T_VERSION"
	
	if [ $L4T_VERSION = "32.5.1" ] || [ $L4T_VERSION = "32.5.2" ]; then
		TAG="r32.5.0"
	fi	
elif [ $ARCH = "x86_64" ]; then
	TAG="$ARCH"
else
	echo "unsupported architecture:  $ARCH"
	exit 1
fi

CONTAINER_NAME="jetson-voice"




================================================
FILE: examples/asr.py
================================================
#!/usr/bin/env python3
# coding: utf-8

import sys

from jetson_voice import ASR, AudioInput, ConfigArgParser, list_audio_devices
    
    
parser = ConfigArgParser()

parser.add_argument('--model', default='quartznet', type=str, help='path to model, service name, or json config file')
parser.add_argument('--wav', default=None, type=str, help='path to input wav/ogg/flac file')
parser.add_argument('--mic', default=None, type=str, help='device name or number of input microphone')
parser.add_argument('--list-devices', action='store_true', help='list audio input devices')

args = parser.parse_args()
print(args)
    
# list audio devices
if args.list_devices:
    list_audio_devices()
    sys.exit()
    
# load the model
asr = ASR(args.model)

# create the audio input stream
stream = AudioInput(wav=args.wav, mic=args.mic, 
                     sample_rate=asr.sample_rate, 
                     chunk_size=asr.chunk_size)

# run transcription
for samples in stream:
    results = asr(samples)
    
    if asr.classification:
        print(f"class '{results[0]}' ({results[1]:.3f})")
    else:
        for transcript in results:
            print(transcript['text'])
            
            if transcript['end']:
                print('')
                
print('\naudio stream closed.')
    

================================================
FILE: examples/assistant.py
================================================
#!/usr/bin/env python3
# coding: utf-8

import sys
import pprint

from jetson_voice import (
    ASR, NLP, TTS, 
    AudioInput, AudioOutput, list_audio_devices,
    ConfigArgParser
)
       
parser = ConfigArgParser()

parser.add_argument('--asr-model', default='quartznet', type=str, help='ASR model')
parser.add_argument('--nlp-model', default='distilbert_intent', type=str, help='NLP model')
parser.add_argument('--tts-model', default='fastpitch_hifigan', type=str, help='TTS model')
parser.add_argument('--wav', default=None, type=str, help='path to input wav/ogg/flac file')
parser.add_argument('--mic', default=None, type=str, help='device name or number of input microphone')
parser.add_argument('--output-device', default=None, type=str, help='device name or number of audio output')
parser.add_argument('--list-devices', action='store_true', help='list audio input devices')

args = parser.parse_args()
print(args)
    
# list audio devices
if args.list_devices:
    list_audio_devices()
    sys.exit()
    
# load the models
tts = TTS(args.tts_model)
asr = ASR(args.asr_model, add_punctuation=False)
nlp = NLP(args.nlp_model)

if asr.classification:
    raise ValueError(f"'{args.asr_model}' is a classification model - must use a transcription model for agent")

if nlp.config.type != 'intent_slot':
    raise ValueError(f"'{args.nlp_model}' has type '{nlp.config.type}' - the agent requires an intent_slot model")
    
# create the audio streams
audio_input = AudioInput(wav=args.wav, mic=args.mic, 
                         sample_rate=asr.sample_rate, 
                         chunk_size=asr.chunk_size)

audio_output = AudioOutput(device=args.output_device,
                           sample_rate=tts.sample_rate)


def get_slot(results, name, default='', threshold=0, merge=True):
    """
    Retrieve a slot by name from the intent/slot results.
    The name can be a list of names, and any of them will be matched.
    Only slots with a score above the threshold will be returned.
    If merge is true, all slots by that name will be combined.
    If merge is false, the first matching slot will be returned.
    """
    if isinstance(name, str):
        name = [name]
        
    slots = []

    for slot in results['slots']:
        if any(slot['slot'] == n for n in name) and slot['score'] >= threshold:
            slots.append(slot['text'])
            
    if len(slots) == 0:
        return default
        
    if len(slots) > 1 and merge:
        return ' '.join(slots)
        
    return slots[0]
      
      
def generate_response(query):
    results = nlp(query)
    pprint.pprint(results)
    
    intent = results['intent']
    
    if intent == 'general_praise':
        return "Why thank you very much!"
        
    elif intent == 'weather_query':
        place = get_slot(results, 'place_name')
        date = get_slot(results, 'date')
        
        response = "The weather "
        
        if place: response += 'in ' + place + ' '
        if date:  response += date + ' '
        
        return response + "is forecast to be sunny with a high of 78 degrees."
        
    elif intent == 'recommendation_locations':
        place = get_slot(results, ['place_name', 'business_name'])
        
        if not place:
            return "Please ask again with the name of a store or restaurant."
          
        return f"{place} is located 1 mile away at 1 2 3 Main Street."
        
    return "I'm sorry, I don't understand."
    
# run agent
for input_samples in audio_input:
    transcripts = asr(input_samples)

    for transcript in transcripts:
        print(transcript['text'])
        
        if not transcript['end']:
            continue
            
        print('')
        
        response = generate_response(transcript['text'])
        print(response)
        
        audio_output.write(tts(response))

    """
    if transcripts[0] != 'unknown' and transcripts[1] != 'silence':
        response = generate_response(transcripts[0])
        print(response)
        
        audio_output.write(tts(response))
    """

================================================
FILE: examples/nlp.py
================================================
#!/usr/bin/env python3
# coding: utf-8

import sys
import pprint
import readline

from jetson_voice import NLP, ConfigArgParser


parser = ConfigArgParser()
parser.add_argument('--model', default='distilbert_sentiment', type=str)
args = parser.parse_args()
print(args)

# load the model
model = NLP(args.model)

# QA models should run the nlp_qa.py example
type = model.config.type

if type == 'qa':
    raise ValueError("please run Question/Answer models with the nlp_qa.py sample")


while True:
    print(f'\nEnter {type} query, or Q to quit:')
    query = input('> ')
    
    if query.upper() == 'Q':
        sys.exit()
    
    print('')
    
    results = model(query)
        
    if type == 'intent_slot' or type == 'text_classification':
        pprint.pprint(results)
    
    elif type == 'token_classification':
        print(f'{model.tag_string(query, results, scores=True)}')
        

================================================
FILE: examples/nlp_qa.py
================================================
#!/usr/bin/env python3
# coding: utf-8

import sys
import readline

from jetson_voice import QuestionAnswer, ConfigArgParser

parser = ConfigArgParser()
parser.add_argument('--model', default='distilbert_qa_384', type=str)
parser.add_argument('--top_k', default=1, type=int, help='show the top N answers (default 1)')
args = parser.parse_args()
print(args)

model = QuestionAnswer(args.model)  # load the QA model

builtin_context = {
    "Amazon" : "The Amazon rainforest is a moist broadleaf forest that covers most of the Amazon basin of South America. "
               "This basin encompasses 7,000,000 square kilometres (2,700,000 sq mi), of which 5,500,000 square kilometres "
               "(2,100,000 sq mi) are covered by the rainforest. The majority of the forest is contained within Brazil, "
               "with 60% of the rainforest, followed by Peru with 13%, and Colombia with 10%.",
    
    "Geology" : "There are three major types of rock: igneous, sedimentary, and metamorphic. Igneous rocks are formed from "
                "melted rock deep inside the Earth. Sedimentary rocks are compressed layers of sand, silt, dead plants, and "
                "animal skeletons. Metamorphic rocks are other rocks that are changed by heat and pressure underground.",
    
    "Moon Landing" : "The first manned Moon landing was Apollo 11 on July, 20 1969. The first human to step on the Moon was "
                     "astronaut Neil Armstrong followed second by Buzz Aldrin. They landed in the Sea of Tranquility with their "
                     "lunar module the Eagle. They were on the lunar surface for 2.25 hours and collected 50 pounds of moon rocks.",
           
    "Pi" : "Some people have said that Pi is tasty but there should be a value for Pi, and the value for Pi is around 3.14. "
           "Pi is the ratio of a circle's circumference to it's diameter. The constant Pi was first calculated by Archimedes "
           "in ancient Greece around the year 250 BC.",
           
    "Super Bowl 55" : "Super Bowl 55 took place on February 7, 2021 in Tampa, Florida between the Kansas City Chiefs and "
                      "the Tampa Bay Buccaneers.  The Tampa Bay Buccaneers won by a score of 31 to 9. In his first season "
                      "with Tampa Bay, it was quarterback Tom Brady's seventh Super Bowl win in nine appearances.",
}

context = builtin_context['Amazon']

def print_context():
    print('\nContext:')
    print(context)
    
def parse_commands(entry):
    """
    Parse 'C' command for changing context, 'P' to print context, and 'Q' for quit.
    Returns true if a command was entered, otherwise false.
    """
    global context

    if entry == 'C':
        print('\nSelect from one of the following topics, or enter your own context paragraph:')
        for idx, key in enumerate(builtin_context):
            print(f'   {idx+1}. {key}')
        entry = input('> ')
        try:  # try parsing as a number
            num = int(entry)
            if num > 0 and num <= len(builtin_context):
                context = builtin_context[list(builtin_context.keys())[num-1]]
            else:
                print('Invalid entry')
        except:  # try looking up topic name, otherwise custom paragraph
            if entry in builtin_context:
                context = builtin_context[entry.lower()]
            else:
                context = entry
                
        print_context()
        return True
        
    elif entry == 'P':
        print_context()
        return True
    elif entry == 'Q':
        sys.exit()
        
    return False
    
print_context()

while True:
    print('\nEnter a question, C to change context, P to print context, or Q to quit:')
    entry = input('> ')
    
    if parse_commands(entry.upper()):
        continue
    
    query = {
        'context' : context,
        'question' : entry
    }
    
    results = model(query, top_k=args.top_k)
    
    if args.top_k == 1:
        results = [results]
        
    for result in results:
        print('\nAnswer:', result['answer'])
        print('Score: ', result['score'])
        

================================================
FILE: examples/tts.py
================================================
#!/usr/bin/env python3
# coding: utf-8

import os
import sys
import time
import readline

from jetson_voice import TTS, ConfigArgParser, AudioOutput, list_audio_devices
from soundfile import SoundFile


parser = ConfigArgParser()

parser.add_argument('--model', default='fastpitch_hifigan', type=str)
parser.add_argument('--warmup', default=5, type=int, help='the number of warmup runs')
parser.add_argument("--output-device", default=None, type=str, help='output audio device to use')
parser.add_argument("--output-wav", default=None, type=str, help='output directory or wav file to write to')
parser.add_argument('--list-devices', action='store_true', help='list audio input devices')

args = parser.parse_args()
print(args)

# list audio devices
if args.list_devices:
    list_audio_devices()
    sys.exit()
    
# load the model
tts = TTS(args.model)

# open output audio device
if args.output_device:
    audio_device = AudioOutput(args.output_device, tts.sample_rate)

# create output wav directory
if args.output_wav:
    wav_is_dir = len(os.path.splitext(args.output_wav)[1]) == 0
    wav_count = 0
    if wav_is_dir and not os.path.exists(args.output_wav):
        os.makedirs(args.output_wav)


while True:
    print(f'\nEnter text, or Q to quit:')
    text = input('> ')
    
    if text.upper() == 'Q':
        sys.exit()
    
    print('')
    
    # run the TTS
    for run in range(args.warmup+1):
        start = time.perf_counter()
        audio = tts(text)
        stop = time.perf_counter()
        latency = stop-start
        duration = audio.shape[0]/tts.sample_rate
        print(f"Run {run} -- Time to first audio: {latency:.3f}s. Generated {duration:.2f}s of audio. RTFx={duration/latency:.2f}.")
        
    # output the audio
    if args.output_device:
        audio_device.write(audio)
    
    if args.output_wav:
        wav_path = os.path.join(args.output_wav, f'{wav_count}.wav') if wav_is_dir else args.output_wav
        wav = SoundFile(wav_path, mode='w', samplerate=tts.sample_rate, channels=1)
        wav.write(audio)
        wav.close()
        wav_count += 1
        print(f"\nWrote audio to {wav_path}")

    

================================================
FILE: jetson_voice/__init__.py
================================================
#!/usr/bin/env python3
# coding: utf-8

from .utils import (
    find_resource, list_models, global_config, ConfigDict, ConfigArgParser,
    list_audio_devices, list_audio_inputs, list_audio_outputs, AudioInput, AudioOutput 
)

from .asr import ASR, ASRService
from .tts import TTS, TTSService

from .nlp import (NLP,
    IntentSlot, IntentSlotService, 
    QuestionAnswer, QuestionAnswerService,
    TextClassification, TextClassificationService,
    TokenClassification, TokenClassificationService,
)

from .auto import AutoModel

__version__ = global_config.version


================================================
FILE: jetson_voice/asr.py
================================================
#!/usr/bin/env python3
# coding: utf-8

from jetson_voice.utils import load_resource


def ASR(resource, *args, **kwargs):
    """
    Loads a streaming ASR service or model.
    See the ASRService class for the signature that implementations use.
    """
    factory_map = {
        'riva' : 'jetson_voice.backends.riva.RivaASRService',
        'tensorrt' : 'jetson_voice.models.asr.ASREngine',
        'onnxruntime' : 'jetson_voice.models.asr.ASREngine'
    }
    
    return load_resource(resource, factory_map, *args, **kwargs)

    
class ASRService():
    """
    Streaming ASR service base class.
    """
    def __init__(self, config, *args, **kwargs):
        self.config = config
        
    def __call__(self, samples):
        """
        Transcribe streaming audio samples to text, returning the running phrase.
        Phrases are broken up when a break in the audio is detected (i.e. end of sentence)
        
        Parameters:
          samples (array) -- Numpy array of audio samples.

        Returns a list[dict] of the running transcripts with the following keys:
        
          text (string) -- the transcript of the current sentence
          words (list[dict]) -- a list of word dicts that make up the sentence
          end (bool) -- if true, end-of-sentence due to silence
          
        Each transcript represents one phrase/sentence.  When a sentence has been determined
        to be ended, it will be marked with end=True.  Multiple sentence transcripts can be 
        returned if one just ended and another is beginning. 
        """
        pass
    
    @property
    def classification(self):
        """
        Returns true if this is an ASR classification model (e.g. for VAD or keyword spotting)
        Otherwise, this is an ASR transcription model that converts audio to text.
        """
        return False
        
    @property
    def sample_rate(self):
        """
        The sample rate that the model runs at (in Hz)
        Input audio should be resampled to this rate.
        """
        pass
    
    @property
    def frame_length(self):
        """
        Duration in seconds per frame / chunk.
        """
        pass
        
    @property
    def chunk_size(self):
        """
        Number of samples per frame/chunk (equal to frame_length * sample_rate)
        """
        pass
        
        
if __name__ == "__main__":

    from jetson_voice import list_audio_devices, AudioInput, ConfigArgParser
    import sys
    
    parser = ConfigArgParser()
    
    parser.add_argument('--model', default='quartznet', type=str, help='path to model, service name, or json config file')
    parser.add_argument('--wav', default=None, type=str, help='path to input wav file')
    parser.add_argument('--mic', default=None, type=str, help='device name or number of input microphone')
    parser.add_argument('--list-devices', action='store_true', help='list audio input devices')
    
    args = parser.parse_args()
    print(args)
    
    # list audio devices
    if args.list_devices:
        list_audio_devices()
        sys.exit()
        
    # load the model
    asr = ASR(args.model)
    
    # create the audio input stream
    stream = AudioInput(wav=args.wav, mic=args.mic, 
                         sample_rate=asr.sample_rate, 
                         chunk_size=asr.chunk_size)
    
    # run transcription
    for samples in stream:
        #samples = audio_to_float(samples)
        #print(f'samples {samples.shape} ({audio_db(samples):.1f} dB)')
        results = asr(samples)
        
        if asr.classification:
            print(f"class '{results[0]}' ({results[1]:.3f})")
        else:
            for transcript in results:
                print(transcript['text'])
                
                if transcript['end']:
                    print('')
                    
    print('\naudio stream closed.')
    

================================================
FILE: jetson_voice/auto.py
================================================
#!/usr/bin/env python3
# coding: utf-8

from jetson_voice.asr import ASR
from jetson_voice.nlp import IntentSlot, QuestionAnswer, TextClassification, TokenClassification
from jetson_voice.tts import TTS

from jetson_voice.utils import load_resource


def AutoModel(resource, domain=None, *args, **kwargs):
    """
    Factory for automatically loading models and services.
    First the config is loaded and the type is checked.
    Then the correct instance for the resource is created.
    
    If a domain string is supplied (e.g. 'asr', 'nlp', 'tts'),
    then only resources from that domain will be created.
    """
    type_map = {
        # models
        'asr' : (ASR, 'asr'),
        'asr_classification' : (ASR, 'asr'),
        'intent_slot' : (IntentSlot, 'nlp'),
        'qa' : (QuestionAnswer, 'nlp'),
        'text_classification' : (TextClassification, 'nlp'),
        'token_classification' : (TokenClassification, 'nlp'),
        'tts': (TTS, 'tts'),
        
        # services
        'jarvis_asr' : (ASR, 'asr')
    }

    config = load_resource(resource, None, *args, **kwargs)
    
    if 'type' not in config:
        raise ValueError(f"'type' setting missing from config '{config.path}'")
        
    if config.type not in type_map:
        raise ValueError(f"'{config.path}' has invalid 'type' ({config.type})")
    
    if domain:
        if type_map[config.type][1] != domain.lower():
            raise ValueError(f"invalid model selected - '{config.path}' has domain '{type_map[config.type][1]}', but AutoModel() was called with domain={domain}")
            
    return type_map[config.type][0](config, *args, **kwargs)


================================================
FILE: jetson_voice/backends/onnxruntime/__init__.py
================================================
#!/usr/bin/env python3
# coding: utf-8

from .ort_model import OnnxRuntimeModel



================================================
FILE: jetson_voice/backends/onnxruntime/ort_model.py
================================================
#!/usr/bin/env python3
# coding: utf-8

import os
import logging

# for some reason if PyCUDA isn't initialized before OnnxRuntime
# and TensorRT is also used, it makes TensorRT error
import pycuda.driver as cuda
import pycuda.autoinit

import numpy as np
import onnxruntime as ort


class OnnxRuntimeModel:
    """
    Base class for OnnxRuntime models.
    """
    def __init__(self, config, *args, **kwargs):
        """
        Load an ONNX Runtime model.
        """
        self.config = config
        
        logging.info(f"loading ONNX model '{self.config.model_path}' with onnxruntime")
        self.model = ort.InferenceSession(config.model_path, providers=['CUDAExecutionProvider'])
        logging.info(f"loaded ONNX model '{self.config.model_path}' with onnxruntime")
        
        self.inputs = self.model.get_inputs()
        self.outputs = self.model.get_outputs()
        
        for idx, binding in enumerate(self.inputs):
            print('')
            print(f"input {idx} - {binding.name}")
            print(f"   shape: {binding.shape}")
            print(f"   type:  {binding.type}")
            print('')
 
    def execute(self, inputs, return_dict=False, **kwargs):
        """
        Run the DNN model in TensorRT.  The inputs are provided as numpy arrays in a list/tuple/dict.
        Note that run() doesn't perform any pre/post-processing - this is typically done in subclasses.
        
        Parameters:
          inputs (array, list[array], dict[array]) -- the network inputs as numpy array(s).
                         If there is only one input, it can be provided as a single numpy array.
                         If there are multiple inputs, they can be provided as numpy arrays in a
                         list, tuple, or dict.  Inputs in lists and tuples are assumed to be in the
                         same order as the input bindings.  Inputs in dicts should have keys with the
                         same names as the input bindings.
          return_dict (bool) -- If True, the results will be returned in a dict of numpy arrays, where the
                                keys are the names of the output binding names. By default, the results will 
                                be returned in a list of numpy arrays, in the same order as the output bindings.
          
        Returns the model output as a numpy array (if only one output), list[ndarray], or dict[ndarray].
        """
        if isinstance(inputs, np.ndarray):
            inputs = [inputs]
        
        assert len(inputs) == len(self.inputs)
        
        if isinstance(inputs, (list,tuple)):
            inputs = {self.inputs[i].name : input for i, input in enumerate(inputs)}
        elif not isinstance(inputs, dict):        
            raise ValueError(f"inputs must be a list, tuple, or dict (instead got type '{type(inputs).__name__}')")
            
        outputs = self.model.run(None, inputs)
        
        if return_dict:
            return {self.outputs[i].name : output for i, output in enumerate(outputs)}
            
        if len(outputs) == 1:
            return outputs[0]
        
        return outputs

================================================
FILE: jetson_voice/backends/riva/__init__.py
================================================
#!/usr/bin/env python3
# coding: utf-8

from .riva_asr import RivaASRService
from .riva_tts import RivaTTSService


================================================
FILE: jetson_voice/backends/riva/riva_asr.py
================================================
#!/usr/bin/env python3
# coding: utf-8

import os
import grpc
import queue
import threading
import logging

import riva_api.audio_pb2 as ra
import riva_api.riva_asr_pb2 as rasr
import riva_api.riva_asr_pb2_grpc as rasr_srv

from jetson_voice import ASRService
from jetson_voice.utils import audio_to_int16

    
class RivaASRService(ASRService):
    """
    Riva streaming ASR service.  
    """
    def __init__(self, config, *args, **kwargs):
        """
        Open a streaming channel to the Riva server for ASR.  This establishes a connection over GRPC 
        and sends/recieves the requests and responses asynchronously.  Incoming audio samples get put
        into a request queue that GRPC picks up, and a thread waits on responses to come in.
        """
        super(RivaASRService, self).__init__(config, *args, **kwargs)
        
        self.config.setdefault('server', 'localhost:50051')
        self.config.setdefault('sample_rate', 16000)
        self.config.setdefault('frame_length', 1.0)
        self.config.setdefault('request_timeout', 2.0)      # how long to wait for new audio to come in
        self.config.setdefault('response_timeout', 0.05)    # how long to wait for results from riva
        self.config.setdefault('language_code', 'en-US')
        self.config.setdefault('enable_automatic_punctuation', True)
        self.config.setdefault('top_k', 1)

        logging.info(f'Riva ASR service config:\n{self.config}')
        
        self.channel = grpc.insecure_channel(self.config.server)
        self.client = rasr_srv.RivaSpeechRecognitionStub(self.channel)
        
        self.recognition_config = rasr.RecognitionConfig(
            encoding = ra.AudioEncoding.LINEAR_PCM,
            sample_rate_hertz = self.config.sample_rate,
            language_code = self.config.language_code,
            max_alternatives = self.config.top_k,
            enable_word_time_offsets = True,
            enable_automatic_punctuation = self.config.enable_automatic_punctuation
        )

        self.streaming_config = rasr.StreamingRecognitionConfig(
            config = self.recognition_config,
            interim_results = True
        )
        
        self.request_queue = queue.Queue()
        self.request_queue.put(rasr.StreamingRecognizeRequest(streaming_config=self.streaming_config))
         
        self.responses = self.client.StreamingRecognize(self)
        self.responses_queue = queue.Queue()
        
        self.response_thread = threading.Thread(target=self.recieve_responses)
        self.response_thread.start()

    def __call__(self, samples):
        """
        Transcribe streaming audio samples to text, returning the running phrase.
        Phrases are broken up when a break in the audio is detected (i.e. end of sentence)
        
        Parameters:
          samples (array) -- Numpy array of audio samples.

        Returns a list[dict] of the running transcripts with the following keys:
        
          text (string) -- the transcript of the current sentence
          words (list[dict]) -- a list of word dicts that make up the sentence
          end (bool) -- if true, end-of-sentence due to silence
          
        Each transcript represents one phrase/sentence.  When a sentence has been determined
        to be ended, it will be marked with end=True.  Multiple sentence transcripts can be 
        returned if one just ended and another is beginning. 
        """
        samples = audio_to_int16(samples)

        self.request_queue.put(rasr.StreamingRecognizeRequest(audio_content=samples.tobytes()))
        
        transcripts = []
        
        while True:
            try:
                transcripts.append(self.responses_queue.get(block=True, timeout=self.config.response_timeout))
            except queue.Empty:
                break

        return transcripts
 
    def __next__(self):
        """
        Retrieve the next request containing audio samples to send to the Riva server.
        This is implemented using an iterator interface as that is what GRPC expects.
        """
        try:
            request = self.request_queue.get(block=True, timeout=self.config.request_timeout)
            return request
        except queue.Empty:
            logging.debug(f'{self.config.request_timeout} second timeout occurred waiting for audio samples, stopping Riva ASR service')
            raise StopIteration
        
    def recieve_responses(self):
        """
        Wait to recieve responses from the Riva server and parse them.
        """
        logging.debug('starting Riva ASR service response reciever thread')
        
        for response in self.responses:  # this is blocking
            if not response.results:
                continue

            result = response.results[0]

            if not result.alternatives:
                continue

            text = result.alternatives[0].transcript
            text = text.strip()
            
            if len(text) == 0:
                continue
                
            self.responses_queue.put({
                'text' : text,
                'end' : result.is_final
            })

        logging.debug('exiting Riva ASR service response reciever thread')
        
    @property
    def sample_rate(self):
        """
        The sample rate that the model runs at (in Hz)
        Input audio should be resampled to this rate.
        """
        return self.config.sample_rate
    
    @property
    def frame_length(self):
        """
        Duration in seconds per frame / chunk.
        """
        return self.config.frame_length
        
    @property
    def chunk_size(self):
        """
        Number of samples per frame/chunk (equal to frame_length * sample_rate)
        """
        return int(self.frame_length * self.sample_rate)



================================================
FILE: jetson_voice/backends/riva/riva_tts.py
================================================
#!/usr/bin/env python3
# coding: utf-8

import os
import grpc
import logging
import numpy as np

import riva_api.audio_pb2 as ra
import riva_api.riva_tts_pb2 as rtts
import riva_api.riva_tts_pb2_grpc as rtts_srv

from jetson_voice import TTSService

    
class RivaTTSService(TTSService):
    """
    Riva streaming TTS service.  
    """
    def __init__(self, config, *args, **kwargs):
        """
        Open a streaming channel to the Riva server for TTS.  This establishes a connection over GRPC 
        and sends/recieves the requests and responses.
        """
        super(RivaTTSService, self).__init__(config, *args, **kwargs)
        
        self.config.setdefault('server', 'localhost:50051')
        self.config.setdefault('sample_rate', 22050)        # ignored (will always be 22.05KHz)
        self.config.setdefault('voice_name', 'ljspeech')    # ignored
        self.config.setdefault('language_code', 'en-US')

        logging.info(f'Riva TTS service config:\n{self.config}')
        
        self.channel = grpc.insecure_channel(self.config.server)
        self.client = rtts_srv.RivaSpeechSynthesisStub(self.channel)

    def __call__(self, text):
        """
        Generate audio from text.
        
        Parameters:
          text (string) -- The phrase to convert to audio.

        Returns audio samples in a numpy array.
        """
        req = rtts.SynthesizeSpeechRequest()
        
        req.text = text
        req.language_code = self.config.language_code
        req.sample_rate_hz = self.config.sample_rate
        req.voice_name = self.config.voice_name
        req.encoding = ra.AudioEncoding.LINEAR_PCM

        resp = self.client.Synthesize(req)
        
        samples = np.frombuffer(resp.audio, dtype=np.float32)
        return samples
    
    @property
    def sample_rate(self):
        """
        Get the output sample rate (in Hz)
        """
        return self.config.sample_rate

================================================
FILE: jetson_voice/backends/tensorrt/__init__.py
================================================
#!/usr/bin/env python3
# coding: utf-8

from .trt_model import TRTModel



================================================
FILE: jetson_voice/backends/tensorrt/trt_binding.py
================================================
#!/usr/bin/env python3
# coding: utf-8

import logging
import tensorrt as trt

import pycuda.driver as cuda
import pycuda.autoinit


class Binding:
    """
    Represents an input/output tensor to the model.
    """
    def __init__(self, model, index):
        """
        Parameters:
          model (TRTModel) -- parent model instance
          index (int) -- index of the binding in the model
        """
        self.model = model
        self.index = index

        self.name  = model.trt_engine.get_binding_name(index)
        self.shape = tuple(model.trt_engine.get_binding_shape(index))
        self.dtype = model.trt_engine.get_binding_dtype(index)
        self.input = model.trt_engine.binding_is_input(index)
        self.size  = max(trt.volume(self.shape) * self.dtype.itemsize, 0)
        
        self.dynamic = (self.size <= 0)   
        self.profiles = []
            
        if self.input:
            for i in range(model.trt_engine.num_optimization_profiles):
                profile = model.trt_engine.get_profile_shape(i, index)
                self.profiles.append(dict(
                    min = profile[0],
                    opt = profile[1],
                    max = profile[2]))
        
        self.alloc()
          
    def alloc(self, shape=None):
        """
        Allocate memory for the binding. alloc() is called automatically when needed.
        If new shape is provided, it will update the internal state. 
        """
        if shape is not None:
            self.shape = shape
            
        self.size = trt.volume(self.shape) * self.dtype.itemsize
        
        if self.size <= 0:  # dynamic with shape not yet set
            self.host = None
            self.device = None
            return
            
        self.host = None if self.input else cuda.pagelocked_empty(self.shape, dtype=trt.nptype(self.dtype))
        self.device = cuda.mem_alloc(self.size)
        
    def set_shape(self, shape):
        """
        Set the shape of a dynamic input binding.
        """
        if not self.dynamic:
            raise ValueError(f"binding '{self.name}' is not dynamic")
            
        if not self.input:
            raise ValueError(f"binding '{self.name}' is not an input")
            
        # check to see if the shape already matches
        if self.shape == shape:
            logging.debug(f"binding '{self.name}' already has shape {shape}")
            return
            
        logging.debug(f"binding '{self.name}' has new shape {shape}")
        
        # set the new shape
        if not self.model.trt_context.set_binding_shape(self.index, shape):
            raise ValueError(f"failed to set binding '{self.name}' with shape {shape}")
           
        # re-allocate tensor memory
        self.alloc(shape)
    
    def query_shape(self):
        """
        Updates the shape of a dynamic output binding.
        """
        if not self.dynamic:
            return
            
        if self.input:
            raise ValueError(f"binding '{self.name}' is not an output")
        
        # get the new shape
        shape = tuple(self.model.trt_context.get_binding_shape(self.index))
        
        # check to see if the shape already matches
        if self.shape == shape:
            logging.debug(f"binding '{self.name}' already has shape {shape}")
            return
        
        logging.debug(f"binding '{self.name}' has new output shape {shape}")
        
        # re-allocate tensor memory
        self.alloc(shape)
        return shape
        
    def __str__(self):
        return (
            f"binding {self.index} - '{self.name}'\n"
            f"   input:    {self.input}\n"
            f"   shape:    {self.shape}\n"
            f"   dtype:    {self.dtype}\n"
            f"   size:     {self.size}\n"
            f"   dynamic:  {self.dynamic}\n"
            f"   profiles: {self.profiles}\n"
        )

================================================
FILE: jetson_voice/backends/tensorrt/trt_builder.py
================================================
#!/usr/bin/env python3
# coding: utf-8

import os
import time
import json
import logging
import tensorrt as trt

import pycuda.driver as cuda
import pycuda.autoinit

TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)

def build_engine(config, 
                 output=None, 
                 precision='fp16',
                 batch_size=1,
                 dynamic_shapes=None,
                 workspace=128, 
                 parse_only=False):
    """
    Build TensorRT engine from ONNX model.
    
    Parameters:
      model (string) -- path to ONNX model
      config (string) -- path to model configuration json (will be inferred from model path if empty)
      output (string) -- path to output serialized TensorRT engine (will be inferred from model path if empty)
      precision (string) -- fp32 or fp16 (int8 not currently supported)
      batch_size (int) -- the maximum batch size (default 1)
      dynamic_shape (dict) -- dynamic shape profiles for min/max/opt
      workspace (int) -- builder workspace memory size (in MB)
      parse_only (bool) -- if true, test parsing the model before exiting without building the TensorRT engine
      
    Returns the built TensorRT engine (ICudaEngine)
    """
    # set default output path
    if output is None or output == '':
        output = f'{os.path.splitext(config.model_path)[0]}.engine'

    # create TensorRT resources
    builder = trt.Builder(TRT_LOGGER)
    builder_config = builder.create_builder_config()
    network = builder.create_network(1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    parser = trt.OnnxParser(network, TRT_LOGGER)
    
    builder_config.max_workspace_size = workspace * 1 << 20
    
    # set precision
    precision = precision.lower()
    
    if precision == 'fp16':
        builder_config.set_flag(trt.BuilderFlag.FP16)
        logging.info(f'enabled FP16 precision')
    elif precision == 'int8':
        # https://github.com/NVIDIA/TensorRT/blob/d7baf010e4396c87d58e4d8a33052c01c2d89325/demo/BERT/builder.py#L592
        raise NotImplementedError('INT8 support not yet implemented')
        
    # load the model (from ONNX)
    logging.info(f'loading {config.model_path}')
    
    with open(config.model_path, 'rb') as model_file:
        if not parser.parse(model_file.read()):
            logging.error(f'failed to parse ONNX model {config.model_path}')
            for error in range(parser.num_errors): 
                print (parser.get_error(error))
            return None 

    # create dynamic shape profile
    # TODO refactor this to an abstract .get_dynamic_shapes() implementation in each subclass
    # TODO this currently uses same shape for all inputs - allow for different shape profiles
    profile = builder.create_optimization_profile()
    opt_shape = None
    
    """
    if model_type == 'qa' or model_type == 'text_classification' or model_type == 'token_classification':
        min_shape = (1, 1)  # (batch_size, sequence_length)
        max_shape = (batch_size, model_config['dataset']['max_seq_length'])
    elif model_type == 'intent_slot':
        min_shape = (1, 1)  # (batch_size, sequence_length)
        max_shape = (batch_size, model_config['language_model']['max_seq_length'])
    elif model_type == 'asr':
        features = model_config['preprocessor']['features']
        sample_rate = model_config['preprocessor']['sample_rate']
        sample_to_fft = 1.0 / 160.0  # rough conversion from samples to MEL spectrogram dims
        sample_multiplier = sample_rate * sample_to_fft
        
        min_shape = (batch_size, features, int(0.5 * sample_multiplier))  # minimum plausible frame length
        opt_shape = (batch_size, features, int(1.2 * sample_multiplier))  # default of .1s overlap factor (1,64,121)
        max_shape = (batch_size, features, int(3.0 * sample_multiplier))  # enough for 1s overlap factor
    elif model_type == 'asr_classification':
        features = model_config['preprocessor']['n_mels']
        sample_rate = model_config['sample_rate']
        sample_to_fft = 1.0 / 160.0  # rough conversion from samples to MEL spectrogram dims
        sample_multiplier = sample_rate * sample_to_fft
        
        min_shape = (batch_size, features, int(0.5 * sample_multiplier))  # minimum plausible frame length
        opt_shape = (batch_size, features, int(1.2 * sample_multiplier))  # default of .1s overlap factor (1,64,121)
        max_shape = (batch_size, features, int(3.0 * sample_multiplier))  # enough for 1s overlap factor
    elif model_type == 'tts_vocoder':
        min_shape = (batch_size, model_config['features'], 1)
        opt_shape = (batch_size, model_config['features'], 160)  # ~5-6 words
        max_shape = (batch_size, model_config['features'], 512)  # ~15-20 words?
    else:
        raise NotImplementedError(f"model type '{model_type}' is unrecognized or not supported")
    """           
    
    # TODO support different shape profiles for different input tensors
    if dynamic_shapes is not None:        
        if 'min' not in dynamic_shapes:
            dynamic_shapes['min'] = dynamic_shapes['max']
            
        if 'opt' not in dynamic_shapes:
            dynamic_shapes['opt'] = dynamic_shapes['max']
            
        for i in range(network.num_inputs):  # TODO confirm that input is in fact dynamic
            profile.set_shape(network.get_input(i).name, min=dynamic_shapes['min'], opt=dynamic_shapes['opt'], max=dynamic_shapes['max'])

        builder_config.add_optimization_profile(profile)
                    
    def print_summary():
        print('')
        print('----------------------------------------------------')
        print(' BUILDER CONFIGURATION')
        print('----------------------------------------------------')
        print(f'  - model     {config.model_path}')
        print(f'  - config    {config.path}')
        print(f'  - output    {output}')
        print(f'  - type      {config.type}')
        print(f'  - layers    {network.num_layers}')
        print(f'  - inputs    {network.num_inputs}')
        print(f'  - outputs   {network.num_outputs}')
        print(f'  - precision {precision}')
        print(f'  - workspace {workspace}')
        print('')
        
        for i in range(network.num_inputs):
            tensor = network.get_input(i)
            
            print(f'  - input {i}:')
            print(f'      - name     {tensor.name}')
            print(f'      - shape    {tensor.shape}')
            print(f'      - dtype    {tensor.dtype}')
            
        for i in range(network.num_outputs):
            tensor = network.get_output(i)
            
            print(f'  - output {i}:')
            print(f'      - name     {tensor.name}')
            print(f'      - shape    {tensor.shape}')
            print(f'      - dtype    {tensor.dtype}')
           
    print_summary()
    
    if parse_only:
        return None
    
    # build the engine
    build_start_time = time.time()
    
    engine = builder.build_engine(network, builder_config)
    
    if engine is None:
        raise ValueError(f"failed to build TensorRT engine for '{config.model_path}'")
        
    build_time_elapsed = (time.time() - build_start_time)
    print(f'\nbuilt engine in {build_time_elapsed} seconds')

    print_summary()
    
    # save engine
    print('\nserializing engine...')
    serialized_engine = engine.serialize()
    with open(output, "wb") as engine_file:
        engine_file.write(serialized_engine)
    print(f'saved engine to {output}')
        
    return engine
        

'''
if __name__ == "__main__":

    import argparse

    parser = argparse.ArgumentParser()
    
    parser.add_argument('--config', default='', type=str)
    parser.add_argument('--output', default='', type=str)
    parser.add_argument('--precision', default='fp16', choices=['fp32', 'fp16', 'int8'], type=str)
    parser.add_argument('--batch-size', default=1, type=int) # max batch size
    parser.add_argument('--workspace', default=utils.DEFAULT_WORKSPACE, type=int)
    parser.add_argument('--parse-only', action='store_true')
    
    args = parser.parse_args()
    print(args)
    
    build_engine(config=args.config,
                 output=args.output,
                 precision=args.precision,
                 batch_size=args.batch_size,
                 workspace=args.workspace,
                 parse_only=args.parse_only)
'''



================================================
FILE: jetson_voice/backends/tensorrt/trt_model.py
================================================
#!/usr/bin/env python3
# coding: utf-8

import os
import time
import json
import logging
import pprint

import numpy as np
import tensorrt as trt

import pycuda.driver as cuda
import pycuda.autoinit

from .trt_builder import build_engine, TRT_LOGGER
from .trt_binding import Binding


class TRTModel:
    """
    Base class for TensorRT models.
    """
    def __init__(self, config, dynamic_shapes=None, *args, **kwargs):
        """
        Load a TensorRT model from ONNX or serialized TensorRT engine.
        
        Parameters:
          config (ConfigDict) -- configuration dict
          dynamic_shapes (dict) -- dynamic shape profiles for min/max/opt
        """
        self.config = config
            
        # determine if the TensorRT engine already exists
        model_root, model_ext = os.path.splitext(self.config.model_path)
        model_ext = model_ext.lower()
        
        if model_ext == '.onnx':
            engine_path = model_root + '.engine'
            if os.path.exists(engine_path):
                logging.info(f'loading cached TensorRT engine from {engine_path}')
                self.config.model_path = engine_path
                model_ext = '.engine'
                
        # either build or load TensorRT engine
        if model_ext == '.onnx':
            self.trt_engine = build_engine(self.config, dynamic_shapes=dynamic_shapes)
        elif model_ext == '.engine' or model_ext == '.plan':
            with open(self.config.model_path, 'rb') as f:
                self.trt_runtime = trt.Runtime(TRT_LOGGER)
                self.trt_engine  = self.trt_runtime.deserialize_cuda_engine(f.read())
        else:
            raise ValueError(f"invalid model extension '{model_ext}' (should be .onnx, .engine, or .plan)")
            
        if self.trt_engine is None:
            raise IOError(f'failed to load TensorRT engine from {self.model_path}')
                
        self.trt_context = self.trt_engine.create_execution_context()
        logging.info(f'loaded TensorRT engine from {self.config.model_path}')

        # create a stream in which to copy inputs/outputs and run inference
        self.stream = cuda.Stream()
        
        # enumerate bindings
        self.bindings = []
        self.inputs  = []
        self.outputs = []

        for i in range(len(self.trt_engine)):
            binding = Binding(self, i)
            self.bindings.append(binding)
            
            if binding.input:
                self.inputs.append(binding)
            else:
                self.outputs.append(binding)
        
        for binding in self.bindings:
            print(f'\n{binding}')

    def execute(self, inputs, sync=True, return_dict=False, **kwargs):
        """
        Run the DNN model in TensorRT.  The inputs are provided as numpy arrays in a list/tuple/dict.
        Note that run() doesn't perform any pre/post-processing - this is typically done in subclasses.
        
        Parameters:
          inputs (array, list[array], dict[array]) -- the network inputs as numpy array(s).
                         If there is only one input, it can be provided as a single numpy array.
                         If there are multiple inputs, they can be provided as numpy arrays in a
                         list, tuple, or dict.  Inputs in lists and tuples are assumed to be in the
                         same order as the input bindings.  Inputs in dicts should have keys with the
                         same names as the input bindings.
          sync (bool) -- If True (default), will wait for the GPU to be done processing before returning.
          return_dict (bool) -- If True, the results will be returned in a dict of numpy arrays, where the
                                keys are the names of the output binding names. By default, the results will 
                                be returned in a list of numpy arrays, in the same order as the output bindings.
          
        Returns the model output as a numpy array (if only one output), list[ndarray], or dict[ndarray].
        """
        if isinstance(inputs, np.ndarray):
            inputs = [inputs]
        
        assert len(inputs) == len(self.inputs)
        
        # setup inputs + copy to GPU
        def setup_binding(binding, input):
            input = input.astype(trt.nptype(binding.dtype), copy=False)
            if binding.dynamic: 
                binding.set_shape(input.shape)
            cuda.memcpy_htod_async(binding.device, np.ascontiguousarray(input), self.stream)
            
        if isinstance(inputs, (list,tuple)):
            for idx, input in enumerate(inputs):
                setup_binding(self.bindings[idx], input)
        elif isinstance(inputs, dict):        
            for binding_name in inputs:
                setup_binding(self.find_binding(binding_name), inputs[binding_name])
        else:
            raise ValueError(f"inputs must be a list, tuple, or dict (instead got type '{type(inputs).__name__}')")
            
        assert self.trt_context.all_binding_shapes_specified
        assert self.trt_context.all_shape_inputs_specified 
        
        # query new dynamic output shapes
        for output in self.outputs:
            output.query_shape()

        # run inference
        self.trt_context.execute_async_v2(
            bindings=[int(binding.device) for binding in self.bindings], 
            stream_handle=self.stream.handle
        )
          
        # copy outputs to CPU
        for output in self.outputs:
            cuda.memcpy_dtoh_async(output.host, output.device, self.stream)
          
        # wait for completion
        if sync:
            self.stream.synchronize()
            
        # return results
        if return_dict:
            results = {}
            for output in self.outputs:
                results[output.name] = output.host
            return results
        else:
            if len(self.outputs) == 1:
                return self.outputs[0].host
            else:
                return tuple([output.host for output in self.outputs])

    def find_binding(self, name):
        """
        Lookup an input/output binding by name
        """
        for binding in self.bindings:
            if binding.name == name: 
                return binding   
        logging.error(f"couldn't find binding with name '{name}'")
        return None
        
    def set_shape(self, binding, shape):
        """
        Set the shape of a dynamic binding.
        """
        if isinstance(binding, int):
            binding = self.bindings[binding]
        elif isinstance(binding, str):
            binding = self.find_binding(binding)
        elif not isinstance(binding, dict):
            raise ValueError(f'binding must be specified as int, string, or dict (got {type(binding).__name__})')
            
        binding.set_shape(shape)
    


================================================
FILE: jetson_voice/models/__init__.py
================================================
#!/usr/bin/env python3
# coding: utf-8

from .asr import ASREngine
from .nlp import IntentSlotEngine, QuestionAnswerEngine, TextClassificationEngine, TokenClassificationEngine
from .tts import TTSEngine

================================================
FILE: jetson_voice/models/asr/__init__.py
================================================
#!/usr/bin/env python3
# coding: utf-8

from .asr_engine import ASREngine


================================================
FILE: jetson_voice/models/asr/asr_engine.py
================================================
#!/usr/bin/env python3
# coding: utf-8

import os
import time
import pprint
import logging
import importlib

import torch
import numpy as np

from .ctc_decoder import CTCDecoder

from jetson_voice.asr import ASRService
from jetson_voice.utils import audio_to_float, global_config, load_model, softmax

      
class ASREngine(ASRService):
    """
    Streaming ASR (Automatic Speech Recognition) model in TensorRT or onnxruntime.
    This model is primarily designed to be used on a live audio source like a microphone.
    """
    def __init__(self, config, *args, **kwargs):
        """
        Loads a streaming ASR model from ONNX or serialized TensorRT engine.
        
        Parameters:
          model (string) -- path to ONNX model or serialized TensorRT engine/plan
          config (string) -- path to model configuration json (will be inferred from model path if empty)
        """
        super(ASREngine, self).__init__(config, *args, **kwargs)

        if self.config.type != 'asr' and self.config.type != 'asr_classification':
            raise ValueError(f"{self.config.model_path} isn't an ASR model (type '{self.config.type}'")

        # set some default config options that are non-standard in nemo
        if 'streaming' not in self.config:
            self.config['streaming'] = {}
        
        self.config['streaming'].setdefault('frame_length', 1.0)     # duration of signal frame, seconds (TODO shorter defaults for VAD/command classifiers)
        self.config['streaming'].setdefault('frame_overlap', 0.5)    # duration of overlap before/after current frame, seconds
        
        # some config changes for streaming
        if not self.classification:
            self.config['preprocessor']['dither'] = 0.0
            self.config['preprocessor']['pad_to'] = 0
        
            if 'ctc_decoder' not in self.config:
                self.config['ctc_decoder'] = {}
                    
            self.config['ctc_decoder'].setdefault('type', 'greedy')        # greedy or beamsearch
            self.config['ctc_decoder'].setdefault('add_punctuation', True) # add period to the end of sentences
        
            if 'add_punctuation' in kwargs:
                self.config['ctc_decoder']['add_punctuation'] = kwargs['add_punctuation']
                logging.info(f"add_punctuation = {kwargs['add_punctuation']}")
                
        if not self.classification and self.config['preprocessor']['features'] == 64:   # TODO normalization coefficients for citrinet (N=80)
            normalization = {}

            normalization['fixed_mean'] = [
                 -14.95827016, -12.71798736, -11.76067913, -10.83311182,
                 -10.6746914,  -10.15163465, -10.05378331, -9.53918999,
                 -9.41858904,  -9.23382904,  -9.46470918,  -9.56037,
                 -9.57434245,  -9.47498732,  -9.7635205,   -10.08113074,
                 -10.05454561, -9.81112681,  -9.68673603,  -9.83652977,
                 -9.90046248,  -9.85404766,  -9.92560366,  -9.95440354,
                 -10.17162966, -9.90102482,  -9.47471025,  -9.54416855,
                 -10.07109475, -9.98249912,  -9.74359465,  -9.55632283,
                 -9.23399915,  -9.36487649,  -9.81791084,  -9.56799225,
                 -9.70630899,  -9.85148006,  -9.8594418,   -10.01378735,
                 -9.98505315,  -9.62016094,  -10.342285,   -10.41070709,
                 -10.10687659, -10.14536695, -10.30828702, -10.23542833,
                 -10.88546868, -11.31723646, -11.46087382, -11.54877829,
                 -11.62400934, -11.92190509, -12.14063815, -11.65130117,
                 -11.58308531, -12.22214663, -12.42927197, -12.58039805,
                 -13.10098969, -13.14345864, -13.31835645, -14.47345634]
                 
            normalization['fixed_std'] = [
                 3.81402054, 4.12647781, 4.05007065, 3.87790987,
                 3.74721178, 3.68377423, 3.69344,    3.54001005,
                 3.59530412, 3.63752368, 3.62826417, 3.56488469,
                 3.53740577, 3.68313898, 3.67138151, 3.55707266,
                 3.54919572, 3.55721289, 3.56723346, 3.46029304,
                 3.44119672, 3.49030548, 3.39328435, 3.28244406,
                 3.28001423, 3.26744937, 3.46692348, 3.35378948,
                 2.96330901, 2.97663111, 3.04575148, 2.89717604,
                 2.95659301, 2.90181116, 2.7111687,  2.93041291,
                 2.86647897, 2.73473181, 2.71495654, 2.75543763,
                 2.79174615, 2.96076456, 2.57376336, 2.68789782,
                 2.90930817, 2.90412004, 2.76187531, 2.89905006,
                 2.65896173, 2.81032176, 2.87769857, 2.84665271,
                 2.80863137, 2.80707634, 2.83752184, 3.01914511,
                 2.92046439, 2.78461139, 2.90034605, 2.94599508,
                 2.99099718, 3.0167554,  3.04649716, 2.94116777]
                 
            self.config['preprocessor']['normalize'] = normalization
        
        # create preprocessor instance
        preprocessor_name = self.config['preprocessor']['_target_'].rsplit(".", 1)
        preprocessor_class = getattr(importlib.import_module(preprocessor_name[0]), preprocessor_name[1])
        logging.debug(f'ASR preprocessor - {preprocessor_class}')

        preprocessor_config = self.config['preprocessor'].copy()
        preprocessor_config.pop('_target_')

        self.preprocessor = preprocessor_class(**preprocessor_config)

        # load the model
        features = self.config.preprocessor.n_mels if self.classification else self.config.preprocessor.features
        time_to_fft = self.sample_rate * (1.0 / 160.0)     # rough conversion from samples to MEL spectrogram dims
        
        dynamic_shapes = {
            'min' : (1, features, int(0.1 * time_to_fft)), # minimum plausible frame length
            'opt' : (1, features, int(1.5 * time_to_fft)), # default of .5s overlap factor (1,64,121)
            'max' : (1, features, int(3.0 * time_to_fft))  # enough for 2s overlap factor
        }
        
        self.model = load_model(self.config, dynamic_shapes)
        
        # create CTC decoder
        if not self.classification:
            self.ctc_decoder = CTCDecoder.from_config(self.config['ctc_decoder'],
                                                      self.config['decoder']['vocabulary'],
                                                      os.path.dirname(self.config.model_path))
                                                      
            logging.info(f"CTC decoder type: '{self.ctc_decoder.type}'")
            
        # create streaming buffer
        self.n_frame_len = int(self.frame_length * self.sample_rate)
        self.n_frame_overlap = int(self.frame_overlap * self.sample_rate)
        
        self.buffer_length = self.n_frame_len + self.n_frame_overlap
        self.buffer_duration = self.buffer_length / self.sample_rate
        
        self.buffer = np.zeros(shape=self.buffer_length, dtype=np.float32)  # 2*self.n_frame_overlap
    
        
    def __call__(self, samples):
        """
        Transcribe streaming audio samples to text, returning the running phrase.
        Phrases are broken up when a break in the audio is detected (i.e. end of sentence)
        
        Parameters:
          samples (array) -- Numpy array of audio samples.

        Returns a dict of the running phrase.
          transcript (string) -- the current transcript
          latest (string) -- the latest additions to the transcript
          end (bool) -- if true, end-of-sequence due to silence
        """
        samples = audio_to_float(samples)
        
        if len(samples) < self.n_frame_len:
            samples = np.pad(samples, [0, self.n_frame_len - len(samples)], 'constant')
            
        self.buffer[:self.n_frame_overlap] = self.buffer[-self.n_frame_overlap:]
        self.buffer[self.n_frame_overlap:] = samples
        
        if global_config.profile: preprocess_begin = time.perf_counter()
        
        # apply pre-processing
        preprocessed_signal, _ = self.preprocessor(
            input_signal=torch.as_tensor(self.buffer, dtype=torch.float32).unsqueeze(dim=0), 
            length=torch.as_tensor(self.buffer.size, dtype=torch.int64).unsqueeze(dim=0)
        )

        if global_config.profile:
            logging.info(f'preprocess time: {time.perf_counter() - preprocess_begin}')
            network_begin = time.perf_counter()
        
        # run the asr model
        logits = self.model.execute(torch_to_numpy(preprocessed_signal))
        logits = np.squeeze(logits)
        logits = softmax(logits, axis=-1)

        if global_config.profile: logging.info(f'network time: {time.perf_counter() - network_begin}')
        
        self.timestep_duration = self.buffer_duration / logits.shape[0]
        self.n_timesteps_frame = int(self.frame_length / self.timestep_duration)
        self.n_timesteps_overlap = int(self.frame_overlap / self.timestep_duration)

        if self.classification:
            argmax = np.argmax(logits)
            prob = logits[argmax]
            return (self.config['labels'][argmax], prob)
        else:
            self.ctc_decoder.set_timestep_duration(self.timestep_duration)
            self.ctc_decoder.set_timestep_delta(self.n_timesteps_frame)

            if global_config.profile: ctc_decoder_begin = time.perf_counter()
            transcripts = self.ctc_decoder.decode(logits)
            if global_config.profile: logging.info(f'ctc_decoder time: {time.perf_counter() - ctc_decoder_begin}')
            
            return transcripts

    @property
    def classification(self):
        """
        Returns true if this is an ASR classification model.
        """
        return self.config.type == 'asr_classification'
        
    @property
    def sample_rate(self):
        """
        The sample rate that the model runs at.
        Input audio should be resampled to this rate.
        """
        return self.config['sample_rate'] if self.classification else self.config['preprocessor']['sample_rate']
        
    @property
    def frame_length(self):
        """
        Duration in seconds per frame / chunk.
        """
        return self.config['streaming']['frame_length']
        
    @property
    def frame_overlap(self):
        """
        Duration of overlap in seconds before/after current frame.
        """
        return self.config['streaming']['frame_overlap']
    
    @property
    def chunk_size(self):
        """
        Number of samples per frame/chunk (equal to frame_length * sample_rate)
        """
        return self.n_frame_len


def torch_to_numpy(tensor):
    return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()
    
                    

================================================
FILE: jetson_voice/models/asr/ctc_beamsearch.py
================================================
#!/usr/bin/env python3
# coding: utf-8

import os
import logging

from .ctc_decoder import CTCDecoder
from .ctc_utils import find_silent_intervals, merge_words, rebase_word_times, split_words, transcript_from_words

from ctc_decoders import Scorer
from swig_decoders import BeamDecoder, ctc_beam_search_decoder_ex

from jetson_voice.utils import global_config


class CTCBeamSearchDecoder(CTCDecoder):
    """
    CTC beam search decoder that optionally uses a language model.
    """
    def __init__(self, config, vocab, resource_path=None):
        """
        Create a new CTCBeamSearchDecoder.
        
        See CTCDecoder.from_config() to automatically create
        the correct type of instance dependening on config.
        """
        super().__init__(config, vocab)
        self.config.setdefault('word_threshold', -1000.0)
        self.reset()
        
        self.scorer = None    
        #self.num_cores = max(os.cpu_count(), 1)
        
        # set default config
        # https://github.com/NVIDIA/NeMo/blob/855ce265b80c0dc40f4f06ece76d2c9d6ca1be8d/nemo/collections/asr/modules/beam_search_decoder.py#L21
        self.config.setdefault('language_model', None)
        self.config.setdefault('beam_width', 32)#128)
        self.config.setdefault('alpha', 0.7 if self.language_model else 0.0)
        self.config.setdefault('beta', 0.0)
        self.config.setdefault('cutoff_prob', 1.0)
        self.config.setdefault('cutoff_top_n', 40)
        self.config.setdefault('top_k', 3)
        
        # check for language model file
        if self.language_model:
            if not os.path.isfile(self.language_model):
                self.config['language_model'] = os.path.join(resource_path, self.language_model)
                if not os.path.isfile(self.language_model):
                    raise IOError(f"language model file '{self.language_model}' does not exist")
                    
        logging.info('creating CTCBeamSearchDecoder')
        logging.info(str(self.config))
        
        # create scorer
        if self.language_model:
            self.scorer = Scorer(self.config['alpha'],
                                 self.config['beta'],
                                 model_path=self.language_model,
                                 vocabulary=self.vocab)
            
    def decode(self, logits):
        """
        Decode logits into words, and merge the new words with the
        previous words from the running transcript.
        
        Returns the running transcript as a list of word dictionaries, 
        where each word dict has he following keys:
        
           'text' (str) -- the text of the word
           'score' (float) -- the probability of the word
           'start_time' (int) -- the start time of the word (in timesteps)
           'end_time' (int) -- the end time of the word (in timesteps)
           
        Note that the start/end times are transformed from timestamps into
        seconds by the ASR engine after CTCDecoder.decode() is called.
        """
        results = ctc_beam_search_decoder_ex(
            logits.tolist(), 
            self.vocab,
            self.config['beam_width'], 
            self.config['cutoff_prob'], 
            self.config['cutoff_top_n'], 
            self.config['top_k'],
            self.timestep,
            self.scorer)
        
        
        if global_config.debug:
            print('BeamSearch results', len(results))
            for idx, result in enumerate(results):
                print(f"  beam {idx} [{result.score:.3f}] '{result.text}'")
                for word_idx, word in enumerate(result.words):
                    print(f"    word {word_idx} [{word.start_time}:{word.end_time} {word.score:.3f}] '{word.text}'")
                
        words = [{
            'text' : word.text,
            'score' : word.score,
            'start_time' : word.start_time,
            'end_time' : word.end_time
        } for word in results[0].words]
        
        # merge new words with past words
        self.words = merge_words(self.words, words, self.config['word_threshold'], 'similarity')
        
        # look for silent/EOS intervals
        silent_intervals = find_silent_intervals(logits, len(self.vocab), self.timesteps_silence, self.timestep) 
        
        if global_config.debug: 
            print(f'silent intervals:  {silent_intervals}')

        self.timestep += self.timestep_delta
        
        # split the words at EOS intervals
        if len(silent_intervals) > 0:
            wordlists = split_words(self.words, silent_intervals)
            transcripts = []
            
            for idx, wordlist in enumerate(wordlists):
                # ignore blanks (silence after EOS has already occurred)
                if len(wordlist) == 0:
                    continue
                    
                # if there is only one wordlist, then it must be EOS
                # if there are multiple, then the last one is not EOS
                end = (len(wordlists) == 1) or (idx < (len(wordlists) - 1))
                
                if end:
                    wordlist = rebase_word_times(wordlist)
                    self.reset()            # TODO reset timesteps counter correctly
                else:
                    self.words = wordlist   
                    
                transcripts.append((wordlist, end))
        else:
            transcripts = [(self.words, False)]

        return [{
            'text' : transcript_from_words(words, scores=global_config.debug, times=global_config.debug, end=end, add_punctuation=self.config['add_punctuation']),
            'words' : words,
            'end' : end
        } for words, end in transcripts]
        
    def reset(self):
        """
        Reset the CTC decoder state at EOS (end of sentence)
        """
        #self.timestep = 0
        #self.tail_silence = 0
        self.words = []
        
    @property
    def language_model(self):
        return self.config['language_model']
 

================================================
FILE: jetson_voice/models/asr/ctc_decoder.py
================================================
#!/usr/bin/env python3
# coding: utf-8

        
class CTCDecoder:
    """
    CTC decoder base class for ASR.
    """    
    @staticmethod
    def from_config(config, vocab, resource_path=None):
        """
        Static factory function to instantiate the correct
        CTC decoder instance type from the config.
        
           config['type'] == 'greedy' -> CTCGreedyDecoder
           config['type'] == 'beamsearch' -> CTCBeamSearchDecoder
        """
        type = config['type'].lower()
        
        if type == 'greedy':
            from .ctc_greedy import CTCGreedyDecoder
            return CTCGreedyDecoder(config, vocab)
        elif type == "beamsearch":
            from .ctc_beamsearch import CTCBeamSearchDecoder
            return CTCBeamSearchDecoder(config, vocab, resource_path)
        else:
            raise ValueError(f"invalid/unrecognized CTC decoder type '{type}'")
            
    def __init__(self, config, vocab):
        """
        See CTCDecoder.from_config() to automatically create
        the correct type of instance dependening on config.
        """
        self.config = config
        self.vocab = vocab
        self.timestep = 0
        
        self.config.setdefault('vad_eos_duration', 0.65)  # max silent time until end-of-sentence
        self.config.setdefault('timestep_offset', 5)      # number of symbols to drop for smooth streaming
        
    def decode(self, logits):
        """
        Decode logits into words, and merge the new words with the
        previous words from the running transcript.
        
        Returns the running transcript as a list of word dictionaries, 
        where each word dict has he following keys:
        
           'text' (str) -- the text of the word
           'score' (float) -- the probability of the word
           'start_time' (int) -- the start time of the word (in timesteps)
           'end_time' (int) -- the end time of the word (in timesteps)
           
        Note that the start/end times are transformed from timestamps into
        seconds by the ASR engine after CTCDecoder.decode() is called.
        """
        pass
        
    def reset(self):
        """
        Reset the CTC decoder state at EOS (end of sentence)
        """
        pass

    def set_timestep(self, timestep):
        """
        Set the current timestep.
        """
        self.timestep = timestep
    
    def set_timestep_delta(self, offset):
        """
        Set the number of timesteps per frame.
        """
        self.timestep_delta = offset - self.config['timestep_offset']
        
    def set_timestep_duration(self, duration):
        """
        Set the duration of each timestep, in seconds.
        """
        self.timestep_duration = duration
        self.timesteps_silence = self.config['vad_eos_duration'] / self.timestep_duration
             
    @property
    def type(self):
        """
        Return the CTC decoder type string ('greedy' or 'beamsearch')
        """
        return self.config['type'].lower() 
        
 

================================================
FILE: jetson_voice/models/asr/ctc_greedy.py
================================================
#!/usr/bin/env python3
# coding: utf-8

import string
import numpy as np

from .ctc_decoder import CTCDecoder
from .ctc_utils import merge_words, transcript_from_words

from jetson_voice.utils import global_config


class CTCGreedyDecoder(CTCDecoder):
    """
    CTC greedy decoder that simply chooses the highest-probability logits.
    """
    def __init__(self, config, vocab):
        """
        Create a new CTCGreedyDecoder.
        TODO document config.
        
        See CTCDecoder.from_config() to automatically create
        the correct type of instance dependening on config.
        """
        super().__init__(config, vocab)
        
        self.config.setdefault('word_threshold', 0.1)
        
        # add blank symbol to vocabulary
        if '_' not in vocab:
            self.vocab = vocab.copy()
            self.vocab.append('_')
            
        self.reset()
        
    def decode(self, logits):
        """
        Decode logits into words, and merge the new words with the
        previous words from the running transcript.
        
        Returns the running transcript as a list of word dictionaries, 
        where each word dict has he following keys:
        
           'text' (str) -- the text of the word
           'score' (float) -- the probability of the word
           'start_time' (int) -- the start time of the word (in timesteps)
           'end_time' (int) -- the end time of the word (in timesteps)
           
        Note that the start/end times are transformed from timestamps into
        seconds by the ASR engine after CTCDecoder.decode() is called.
        """
        text = []
        prob = 1.0
        probs = []
        
        # select the chars with the max probability
        for i in range(logits.shape[0]):
            argmax = np.argmax(logits[i])
            text.append(self.vocab[argmax])
            probs.append(logits[i][argmax])
              
        if global_config.debug:
            print(text)
            
        # get the max number of sequential silent timesteps (continuing from last frame)
        silent_timesteps = self.end_silent_timesteps
        max_silent_timesteps = 0
        
        for i in range(len(text)):
            if text[i] == '_':
                silent_timesteps += 1
            else:
                max_silent_timesteps = max(silent_timesteps, max_silent_timesteps) if i > 0 else 0
                silent_timesteps = 0
        
        if text[-1] == '_':
            self.end_silent_timesteps = silent_timesteps
           
        # merge repeating chars and blank symbols
        _, words = self.merge_chars(text, probs)  #text[:len(text)-self.config['offset']]
        
        # merge new words with past words
        words = merge_words(self.words, words, self.config['word_threshold'], 'overlap')
        
        # increment timestep (after this frame's timestep is done being used, and before a potential EOS reset)
        self.timestep += self.timestep_delta
        
        # check for EOS
        end = False
        
        if silent_timesteps > self.timesteps_silence:
            end = True
            self.reset()
        else:
            self.words = words
            
        return [{
            'text' : transcript_from_words(words, scores=global_config.debug, times=global_config.debug, end=end, add_punctuation=self.config['add_punctuation']),
            'words' : words,
            'end' : end
        }]
           
    def merge_chars(self, text, probs):
        """
        Merge repeating chars and blank symbols into words.
        """
        text_merged = ''
        
        word = None
        words = []

        def ispunct(ch):
            return ch in (string.punctuation + ' ')
            
        for i in range(len(text)):
            if text[i] != self.prev_char and text[i] != '_':
                self.prev_char = text[i]
                
                if text[i] != '_':
                    text_merged += text[i]

                    if not ispunct(text[i]):
                        if word is None:
                            word = {
                                'text' : text[i],
                                'score' : probs[i],
                                'start_char' : len(text_merged) - 1,
                                'end_char' : len(text_merged),
                                'start_time' : self.timestep + i,
                                'end_time' : self.timestep + i + 1
                            }
                        else:
                            word['text'] += text[i]
                            word['score'] *= probs[i]
                            word['end_char'] = len(text_merged)
                            word['end_time'] = self.timestep + i + 1
    
                if ispunct(text[i]) and word is not None:
                    words.append(word)
                    word = None
            
        if word is not None:
            words.append(word)
                
        return text_merged, words
        
    def reset(self):
        """
        Reset the CTC decoder state at EOS (end of sentence)
        """
        self.prev_char = ''
        self.end_silent_timesteps = 0
        self.timestep = 0
        self.words = []

 

================================================
FILE: jetson_voice/models/asr/ctc_utils.py
================================================
#!/usr/bin/env python3
# coding: utf-8

import nltk
import numpy as np

from jetson_voice.utils import global_config


def transcript_from_words(words, scores=False, times=False, end=False, add_punctuation=True):
    """
    Convert a list of words to the text transcript.
    """
    transcript = ''
    
    for idx, word in enumerate(words):
    
        if scores and times:
            transcript += f"{word['text']} ({word['start_time']}:{word['end_time']} {word['score']:.2f})"
        elif scores:
            transcript += f"{word['text']} ({word['score']:.2f})"
        elif times:
            transcript += f"{word['text']} ({word['start_time']}:{word['end_time']})"
        else:
            transcript += word['text']
        
        if idx < len(words) - 1:
            transcript += ' '
      
    if end and add_punctuation:
        transcript += '.'  # add punctuation to end
      
    return transcript
        

def find_overlapping_word(wordlist, word):
    """
    Find the first word from the list with overlapping times.
    Returns a (word, index) tuple or (None, -1) if no overlap found.
    """
    for idx, word2 in enumerate(wordlist):
        if not (word['end_time'] < word2['start_time'] or word['start_time'] > word2['end_time']):
            return word2, idx 
    return None, -1


def find_word_after(wordlist, time):
    """
    Find the nearest word that starts after the time.
    Returns a (word, index) tuple or (None, 1) if all words start before the time.
    """
    if isinstance(time, tuple):
        time = time[1]  # use the end time
        
    for idx, word in enumerate(wordlist):
        if time <= word['start_time']:
            return word, idx        
            
    return None, -1


def find_word_before(wordlist, time):
    """
    Find the nearest word that starts after the time.
    Returns a (word, index) tuple or (None, 1) if all words start after the time.
    """
    if isinstance(time, tuple):
        time = time[0]  # use the start time
        
    for idx in range(len(wordlist)-1, -1, -1):
        if time >= wordlist[idx]['end_time']:
            return wordlist[idx], idx    
            
    return None, -1


def merge_words(wordlist, words, score_threshold=-np.inf, method='overlap'):
    """
    Merge new words with past words.  This works by finding overlapping or similar words,
    and replacing the old word with new word if the new word has a higher probability.
    """
    if len(words) == 0:
        return wordlist
        
    if len(wordlist) == 0:
        return words
        
    # short-circuit if these are all new words    
    if words[0]['start_time'] > wordlist[-1]['end_time']:
        wordlist.extend(words)
        return wordlist
         
    if method == 'overlap':
        # find words that overlap and pick the highest-scoring one
        for word in words:
            if word['score'] < score_threshold: #self.config['word_threshold']:
                continue
                
            if len(wordlist) == 0 or word['start_time'] > wordlist[-1]['end_time']:
                wordlist.append(word)
                continue

            overlap_word, overlap_idx = find_overlapping_word(wordlist, word)
            
            if overlap_word is None:
                continue

            if global_config.debug:
                print(f"found new '{word['text']}' ({word['start_time']}:{word['end_time']} {word['score']:.2f}) overlaps with '{overlap_word['text']}' ({overlap_word['start_time']}:{overlap_word['end_time']} {overlap_word['score']:.2f})")

            if word['score'] > overlap_word['score']:
                wordlist[overlap_idx] = word
                
    elif method == 'similarity':
        # find the most-similar past word to the first new word
        similarity_metric = np.inf #1000
        similarity_index = -1
        
        for idx in range(len(wordlist)-1, -1, -1):  # search in reverse so words early in the transcript aren't matched first
            similarity = nltk.edit_distance(words[0]['text'], wordlist[idx]['text'])
            
            if similarity < similarity_metric:
                similarity_metric = similarity
                similarity_index = idx
                
            if similarity == 0:
                break
           
        if global_config.debug:
            print(f"closest word to '{words[0]['text']}' is '{wordlist[similarity_index]['text']}' (similarity={similarity_metric}) ")
        
        wordlist = wordlist[:similarity_index]
        wordlist.extend(words)
        
    else:
        raise ValueError(f"invalid method '{method}' (valid options are 'overlap', 'similarity')")
        
    return wordlist
        
        
def split_words(wordlist, times):
    """
    Split the word list by the given times.
    note - these times should be sorted
    """
    wordlists = []

    for time in times:
        _, idx = find_word_after(wordlist, time)
        
        if idx < 0:
            wordlists.append(wordlist)
            return wordlists
            
        wordlists.append(wordlist[:idx])
        wordlist = wordlist[idx:]
        
    wordlists.append(wordlist)    
    return wordlists
        
        
def rebase_word_times(wordlist):
    """
    Re-base the word timings so that the start of the first word is zero.
    """
    if len(wordlist) == 0:
        return wordlist
        
    #wordlist = wordlist.copy()
    start_offset = wordlist[0]['start_time']
            
    for idx in range(len(wordlist)):
        wordlist[idx]['start_time'] -= start_offset
        wordlist[idx]['end_time'] -= start_offset
    
    return wordlist


def find_silent_intervals(logits, blank_symbol_id, min_silent_time, time_offset):
    """
    Find blank/silent regions in the output logits.
    """
    num_timesteps = logits.shape[0]
    silent_intervals = []
    last_interval_start = None
    
    for i in range(num_timesteps):
        argmax = np.argmax(logits[i])
        
        if argmax == blank_symbol_id:
            if last_interval_start is None:
                last_interval_start = i 
        
        if last_interval_start is not None and (argmax != blank_symbol_id or (i == num_timesteps-1)):
            if i - last_interval_start >= min_silent_time:
                silent_intervals.append((last_interval_start + time_offset, i-1+time_offset))
            #    print(f'     new silent interval ({last_interval_start + self.timestep}:{i-1+self.timestep}) {i - last_interval_start} > {min_length:.2f}')  
            #else:
            #    print(f'skipping silent interval ({last_interval_start + self.timestep}:{i-1+self.timestep}) {i - last_interval_start} < {min_length:.2f}')
                
            last_interval_start = None

    return silent_intervals
        


================================================
FILE: jetson_voice/models/nlp/__init__.py
================================================
#!/usr/bin/env python3
# coding: utf-8

from .intent_slot import IntentSlotEngine
from .question_answer import QuestionAnswerEngine
from .text_classification import TextClassificationEngine
from .token_classification import TokenClassificationEngine

================================================
FILE: jetson_voice/models/nlp/intent_slot.py
================================================
#!/usr/bin/env python3
# coding: utf-8

import os
import logging
import numpy as np

from transformers import AutoTokenizer

from jetson_voice.nlp import IntentSlotService
from jetson_voice.utils import load_model, normalize_logits
from .nlp_utils import find_subtokens, nlp_dynamic_shapes


class IntentSlotEngine(IntentSlotService):
    """
    Joint Intent and Slot classification model in TensorRT / onnxruntime.
    """
    def __init__(self, config, *args, **kwargs):
        """
        Load an Intent/Slot classification model from ONNX
        """
        super(IntentSlotEngine, self).__init__(config, *args, **kwargs)

        if self.config.type != 'intent_slot':
            raise ValueError(f"{self.config.model_path} isn't an Intent/Slot model (type '{self.config.type}'")
            
        # load model
        dynamic_shapes = {'max' : (1, self.config['language_model']['max_seq_length'])}  # (batch_size, sequence_length)
        
        if nlp_dynamic_shapes:
            dynamic_shapes['min'] = (1, 1)
        
        self.model = load_model(self.config, dynamic_shapes)
        
        # create tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(self.config['tokenizer']['tokenizer_name'])
        self.null_slot = self.slot_labels[-1]  # 'O' in assistant dataset - always the last label?
        
        
    def __call__(self, query):
        """
        Perform intent/slot classification on the input query.
        
        Parameters:
          query (string) -- The text query, for example:
                             'What is the weather in San Francisco tomorrow?'

        Returns a dict with the following keys:
             'intent' (string) -- the classified intent label
             'score' (float) -- the intent probability [0,1]
             'slots' (list[dict]) -- a list of dicts, where each dict has the following keys:
                  'slot' (string) -- the slot label
                  'text' (string) -- the slot text from the query
                  'score' (float) -- the slot probability [0,1]
        """
        encodings = self.tokenizer(
            text=query,
            padding='longest' if nlp_dynamic_shapes else 'max_length',
            truncation=True,
            max_length=self.config['language_model']['max_seq_length'],
            return_tensors='np',
            return_token_type_ids=True,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            return_special_tokens_mask=True,
        )

        # during slot classification, we want to ignore slots from subtokens and special tokens 
        subtoken_mask = find_subtokens(encodings, method='subtoken_delimiters')
        ignore_mask = subtoken_mask | encodings['special_tokens_mask']
    
        # retrieve the inputs from the encoded tokens
        inputs = {}
        
        for input in self.model.inputs:
            if input.name not in encodings:
                raise ValueError(f"the encoded inputs from the tokenizer doesn't contain '{input.name}'")

            inputs[input.name] = encodings[input.name]
                    
        # run the model
        intent_logits, slot_logits = self.model.execute(inputs)

        intent_logits = normalize_logits(intent_logits)
        slot_logits = normalize_logits(slot_logits)

        intent_preds = np.argmax(intent_logits, axis=-1)
        slot_preds = np.argmax(slot_logits, axis=-1)

        # convert numerical outputs to intent/slot labels
        results = []

        for query_idx, intent_id in enumerate(intent_preds):
            results.append({
                'intent' : self.intent_label(intent_id),
                'score' : intent_logits[query_idx][intent_id],
                'slots' : []
            })
                
        for query_idx, slots in enumerate(slot_preds):
            query_slots = [self.slot_label(slot) for slot in slots]

            for token_idx, slot in enumerate(query_slots):
                # ignore unclassified slots or masked tokens
                if slot == self.null_slot or ignore_mask[query_idx][token_idx]:
                    continue
                    
                # convert from token index back to the query string
                chars = encodings.token_to_chars(query_idx, token_idx)
                text = query[chars[0]:chars[1]]      # queries[query_idx]
                
                # append subtokens from the query to the text
                for subtoken_idx in range(token_idx+1, len(query_slots)):
                    if subtoken_mask[query_idx][subtoken_idx]:
                        subtoken_chars = encodings.token_to_chars(query_idx, subtoken_idx)
                        text += query[subtoken_chars[0]:subtoken_chars[1]]
                    else:
                        break
                        
                results[query_idx]['slots'].append({
                    'slot' : slot,
                    'text' : text,
                    'score' : slot_logits[query_idx][token_idx][slots[token_idx]]
                })
        
        if len(results) == 1:
            return results[0]
        else:
            return results
            
    @property
    def intent_labels(self):
        """
        List of the intent class labels.
        """
        return self.config['data_desc']['intent_labels']
    
    def intent_label(self, index):
        """
        Return an intent label by index (with bounds checking)
        """
        return self.intent_labels[int(index)] if index < len(self.intent_labels) else 'Unknown_Intent'
        
    @property
    def slot_labels(self):
        """
        List of the slot class labels.
        """
        return self.config['data_desc']['slot_labels']
    
    def slot_label(self, index):
        """
        Return a slot label by index (with bounds checking)
        """
        return self.slot_labels[int(index)] if index < len(self.slot_labels) else self.null_slot
        

================================================
FILE: jetson_voice/models/nlp/nlp_utils.py
================================================
#!/usr/bin/env python3
# coding: utf-8

import numpy as np


# NLP BERT models (and BERT derivatives) have myelin problem with dynamic shapes on aarch64,
# so we disable dynamic shape changing for now (shapes will be set to the max sequence length)
nlp_dynamic_shapes=False


def find_subtokens(encodings, method='char_span'):
    """
    Compute the subtoken mask, where each token is marked as True if it's a subtoken or False otherwise.
    Longer words/acronyms may be tokenized into mulitple word pieces (called subtokens), for example:
    
        'Yosemite' -> ['yo', '##se', '##mite']
        'U.S.' -> ['u', '.', 's', '.']
    
    Parameters:
      encodings (BatchEncoding) -- Output from tokenizer
      
      method (string) -- If 'char_span', the subtoken mask will be determined by looking at the character
                         indices.  Tokens that map to characters that are side-by-side are flagged as subtokens.
                         
                         If 'subtoken_delimiters', subtokens will be identified by looking for '##' symbols.
                         However this can miss punctuated subtokens, such as 'U.S.'
    
    Returns boolean subtoken mask array with shape (num_queries, num_tokens)
    """
    num_queries = encodings['input_ids'].shape[0]
    subtoken_mask = []
    
    if method == 'char_span':
        for query_idx in range(num_queries):
            mask = []
            last_char = -1
            tokens = encodings.tokens(query_idx)
            
            for token_idx, word_id in enumerate(encodings.word_ids(query_idx)):
                if word_id is None:  # skip special tokens
                    mask.append(False)
                    continue
                    
                chars = encodings.token_to_chars(query_idx, token_idx)
                
                if chars[0] == last_char:
                    mask.append(True)
                else:
                    mask.append(False)
                    
                last_char = chars[1]

            subtoken_mask.append(mask)
            
    elif method == 'subtoken_delimiters':
        for query_idx in range(num_queries):
            subtoken_mask.append([token.startswith('##') for token in encodings.tokens(query_idx)])
    else:
        raise ValueError(f"invalid method ('{method}')")
        
    return np.asarray(subtoken_mask)
        

================================================
FILE: jetson_voice/models/nlp/question_answer.py
================================================
#!/usr/bin/env python3
# coding: utf-8

import os
import logging
import numpy as np

from transformers import AutoTokenizer

from jetson_voice.nlp import QuestionAnswerService
from jetson_voice.utils import load_model, normalize_logits
from .nlp_utils import nlp_dynamic_shapes


class QuestionAnswerEngine(QuestionAnswerService):
    """
    Question answering model in TensorRT / onnxruntime.
    """
    def __init__(self, config, *args, **kwargs):
        """
        Load an question answering model from ONNX
        """
        super(QuestionAnswerEngine, self).__init__(config, *args, **kwargs)

        if self.config.type != 'qa':
            raise ValueError(f"{self.config.model_path} isn't a Question Answering model (type '{self.config.type}'")
            
        # load model
        dynamic_shapes = {'max' : (1, self.config['dataset']['max_seq_length'])}  # (batch_size, sequence_length)
        
        if nlp_dynamic_shapes:
            dynamic_shapes['min'] = (1, 1)
        
        self.model = load_model(self.config, dynamic_shapes)
        
        # create tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(self.config['tokenizer']['tokenizer_name'])
        self.question_first = bool(self.tokenizer.padding_side == "right")
        
        
    def __call__(self, query, top_k=1):
        """
        Perform question/answering on the input query.
        
        Parameters:
          query (dict or tuple) -- Either a dict with 'question' and 'context' keys,
                                   or a (question, context) tuple.
          top_k (int) -- How many of the top results to return, sorted by score.
                         The default (top_k=1) is to return just the top result.
                         If top_k > 1, then a list of results will be returned.
          
        Returns:
          dict(s) with the following keys:
          
             'answer' (string) -- the answer text
             'score' (float) -- the probability [0,1]
             'start' (int) -- the starting character index of the answer into the context text
             'end' (int) -- the ending character index of the answer into the context text
             
          If top_k > 1, a list of dicts with the top_k results will be returned.
          If top_k == 1, just the single dict with the top score will be returned.
        """
        if isinstance(query, dict):
            question = query['question']
            context = query['context']
        elif isinstance(query, tuple):
            question = query[0]
            context = query[1]
        else:
            raise ValueError(f'query must be a dict or tuple (instead was type {type(query).__name__})')

        # check for models that have a doc_stride >= max_seq_length
        # this will cause an exception in the tokenizer
        doc_stride = self.config['dataset']['doc_stride']
        max_seq_len = self.config['dataset']['max_seq_length']
        
        if doc_stride >= max_seq_len:
            doc_stride = int(max_seq_len/2)
            
        # tokenize the inputs
        encodings = self.tokenizer(
            text=question if self.question_first else context,
            text_pair=context if self.question_first else question_text,
            padding='longest' if nlp_dynamic_shapes else 'max_length',
            truncation="only_second" if self.question_first else "only_first",
            max_length=max_seq_len,
            stride=doc_stride,
            return_tensors='np',
            return_token_type_ids=True,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            return_special_tokens_mask=True,
        )
        
        # When the input is too long, it's converted in a batch of inputs with overflowing tokens
        # and a stride of overlap between the inputs. If a batch of inputs is given, a special output
        # "overflow_to_sample_mapping" indicate which member of the encoded batch belong to which original batch sample.
        # Here we tokenize examples one-by-one so we don't need to use "overflow_to_sample_mapping".
        # "num_span" is the number of output samples generated from the overflowing tokens.
        num_spans = len(encodings["input_ids"])
        logging.debug(f'num_spans: {num_spans}')

        # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer)
        # We put 0 on the tokens from the context and 1 everywhere else (question and special tokens)
        p_mask = np.asarray(
            [
                [tok != 1 if self.question_first else 0 for tok in encodings.sequence_ids(span_id)]
                for span_id in range(num_spans)
            ]
        )

        # keep the cls_token unmasked (some models use it to indicate unanswerable questions)
        if self.tokenizer.cls_token_id is not None:
            cls_index = np.nonzero(encodings["input_ids"] == self.tokenizer.cls_token_id)
            p_mask[cls_index] = 0
            
        # run the model over each span (TODO batching)
        model_outputs = []
        
        for span_idx in range(num_spans):
            inputs = {}
            
            for input in self.model.inputs:
                if input.name not in encodings:
                    raise ValueError(f"the encoded inputs from the tokenizer doesn't contain '{input.name}'")

                inputs[input.name] = np.expand_dims(encodings[input.name][span_idx], axis=0) # add batch dim

            model_outputs.append(self.model.execute(inputs))
            
        # post-processing
        answers = []
        min_null_score = 1000000
        handle_impossible_answer = self.config['dataset']['version_2_with_negative']
        
        for span_idx in range(num_spans):
            start_logits = np.squeeze(model_outputs[span_idx][:,:,0])
            end_logits = np.squeeze(model_outputs[span_idx][:,:,1])

            # Ensure padded tokens & question tokens cannot belong to the set of candidate answers.
            undesired_tokens = np.abs(p_mask[span_idx] - 1) & encodings['attention_mask'][span_idx]

            # Generate mask
            undesired_tokens_mask = (undesired_tokens == 0.0)

            # Make sure non-context indexes in the tensor cannot contribute to the softmax
            start_logits = np.where(undesired_tokens_mask, -10000.0, start_logits)
            end_logits = np.where(undesired_tokens_mask, -10000.0, end_logits)

            # Normalize logits and spans to retrieve the answer
            start_logits = np.exp(start_logits - np.log(np.sum(np.exp(start_logits), axis=-1, keepdims=True)))
            end_logits = np.exp(end_logits - np.log(np.sum(np.exp(end_logits), axis=-1, keepdims=True)))

            if handle_impossible_answer:
                min_null_score = min(min_null_score, (start_logits[0] * end_logits[0]).item())

            # Mask CLS
            start_logits[0] = end_logits[0] = 0.0

            # Decode token probabilities
            starts, ends, scores = self.decode(start_logits, end_logits, top_k=top_k)

            if self.tokenizer.is_fast:
                # Convert the answer (tokens) back to the original text
                # Score: score from the model
                # Start: Index of the first character of the answer in the context string
                # End: Index of the character following the last character of the answer in the context string
                # Answer: Plain text of the answer
                enc = encodings[span_idx]
                
                # Sometimes the max probability token is in the middle of a word so:
                # - we start by finding the right word containing the token with `token_to_word`
                # - then we convert this word in a character span with `word_to_chars`
                for s, e, score in zip(starts, ends, scores):
                    start = enc.word_to_chars(enc.token_to_word(s), sequence_index=1 if self.question_first else 0)[0]
                    end = enc.word_to_chars(enc.token_to_word(e), sequence_index=1 if self.question_first else 0)[1]
                    
                    answers.append({
                        'answer' : context[start : end],
                        'score' : score.item(),
                        'start' : start,
                        'end' : end
                    })
            else:
                raise NotImplementedError('QA post-processing is only implemented for fast tokenizers')
            
        if handle_impossible_answer:
            answers.append({'answer': '', 'score': min_null_score, 'start': 0, 'end': 0})

        answers = sorted(answers, key=lambda x: x['score'], reverse=True)[:top_k]
        
        if top_k == 1:
            return answers[0]
        else:
            return answers


    def decode(self, start: np.ndarray, end: np.ndarray, top_k: int):
        """
        Take the QA model output and will generate probabilities for each span to be the actual answer.
        In addition, it filters out some unwanted/impossible cases like answer len being greater than max_answer_len or
        answer end position being before the starting position. The method supports output the k-best answer through
        the top_k argument.
        Args:
            start (:obj:`np.ndarray`): Individual start probabilities for each token.
            end (:obj:`np.ndarray`): Individual end probabilities for each token.
            top_k (:obj:`int`): Indicates how many possible answer span(s) to extract from the model output.
            max_answer_len (:obj:`int`): Maximum size of the answer to extract from the model's output.
        """
        # Ensure we have batch axis
        if start.ndim == 1:
            start = start[None]

        if end.ndim == 1:
            end = end[None]

        # Compute the score of each tuple(start, end) to be the real answer
        outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))

        # Remove candidate with end < start and end - start > max_answer_len
        candidates = np.tril(np.triu(outer), self.config['dataset']['max_answer_length'] - 1)

        #  Inspired by Chen & al. (https://github.com/facebookresearch/DrQA)
        scores_flat = candidates.flatten()
        if top_k == 1:
            idx_sort = [np.argmax(scores_flat)]
        elif len(scores_flat) < top_k:
            idx_sort = np.argsort(-scores_flat)
        else:
            idx = np.argpartition(-scores_flat, top_k)[0:top_k]
            idx_sort = idx[np.argsort(-scores_flat[idx])]

        start, end = np.unravel_index(idx_sort, candidates.shape)[1:]
        return start, end, candidates[0, start, end] 
        

================================================
FILE: jetson_voice/models/nlp/text_classification.py
================================================
#!/usr/bin/env python3
# coding: utf-8

import os
import logging
import numpy as np

from transformers import AutoTokenizer

from jetson_voice.nlp import TextClassificationService
from jetson_voice.utils import load_model, normalize_logits
from .nlp_utils import nlp_dynamic_shapes


class TextClassificationEngine(TextClassificationService):
    """
    Text classification model in TensorRT / onnxruntime.
    """
    def __init__(self, config, *args, **kwargs):
        """
        Load an text classification model from ONNX
        """
        super(TextClassificationEngine, self).__init__(config, *args, **kwargs)

        if self.config.type != 'text_classification':
            raise ValueError(f"{self.config.model_path} isn't a Text Classification model (type '{self.config.type}'")
            
        # load model
        dynamic_shapes = {'max' : (1, self.config['dataset']['max_seq_length'])}  # (batch_size, sequence_length)
        
        if nlp_dynamic_shapes:
            dynamic_shapes['min'] = (1, 1)
        
        self.model = load_model(self.config, dynamic_shapes)
        
        # create tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(self.config['tokenizer']['tokenizer_name'])
        
        
    def __call__(self, query):
        """
        Perform text classification on the input query.
        
        Parameters:
          query (string) -- The text query, for example:
                             'Today was warm, sunny and beautiful out.'

        Returns a dict with the following keys:
             'class' (int) -- the predicted class index
             'label' (string) -- the predicted class label (and if there aren't labels `str(class)`)
             'score' (float) -- the classification probability [0,1]
        """
        encodings = self.tokenizer(
            text=query,
            padding='longest' if nlp_dynamic_shapes else 'max_length',
            truncation=True,
            max_length=self.config['dataset']['max_seq_length'],
            return_tensors='np',
            return_token_type_ids=True,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            return_special_tokens_mask=True,
        )
    
        # retrieve the inputs from the encoded tokens
        inputs = {}
        
        for input in self.model.inputs:
            if input.name not in encodings:
                raise ValueError(f"the encoded inputs from the tokenizer doesn't contain '{input.name}'")

            inputs[input.name] = encodings[input.name]
                    
        # run the model
        logits = self.model.execute(inputs)
        logits = normalize_logits(logits)
        preds  = np.argmax(logits, axis=-1)
 
        # tabulate results
        results = []
        
        for query_idx in range(preds.shape[0]):
            results.append({
                'class' : int(preds[query_idx]),
                'label' : str(preds[query_idx]),
                'score' : logits[query_idx][preds[query_idx]]
            })
            
        if len(results) == 1:
            return results[0]
        else:
            return results
        

================================================
FILE: jetson_voice/models/nlp/token_classification.py
================================================
#!/usr/bin/env python3
# coding: utf-8

import os
import logging
import numpy as np

from transformers import AutoTokenizer

from jetson_voice.nlp import TokenClassificationService
from jetson_voice.utils import load_model, normalize_logits
from .nlp_utils import find_subtokens, nlp_dynamic_shapes


class TokenClassificationEngine(TokenClassificationService):
    """
    Token classification model (aka Named Entity Recognition) in TensorRT / onnxruntime.
    """
    def __init__(self, config, *args, **kwargs):
        """
        Load an token classification model for NER from ONNX
        """
        super(TokenClassificationEngine, self).__init__(config, *args, **kwargs)

        if self.config.type != 'token_classification':
            raise ValueError(f"{self.config.model_path} isn't a Token Classification model (type '{self.config.type}'")
            
        # load model
        dynamic_shapes = {'max' : (1, self.config['dataset']['max_seq_length'])}  # (batch_size, sequence_length)
        
        if nlp_dynamic_shapes:
            dynamic_shapes['min'] = (1, 1)
        
        self.model = load_model(self.config, dynamic_shapes)
        
        # create tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(self.config['tokenizer']['tokenizer_name'])
        
        
    def __call__(self, query):
        """
        Perform token classification (NER) on the input query and return tagged entities.
        
        Parameters:
          query (string) -- The text query, for example:
                             "Ben is from Chicago, a city in the state of Illinois, US'

        Returns a list[dict] of tagged entities with the following dictionary keys:
             'class' (int) -- the entity class index
             'label' (string) -- the entity class label
             'score' (float) -- the classification probability [0,1]
             'text'  (string) -- the corresponding text from the input query
             'start' (int) -- the starting character index of the text
             'end'   (int) -- the ending character index of the text
        """
        encodings = self.tokenizer(
            text=query,
            padding='longest' if nlp_dynamic_shapes else 'max_length',
            truncation=True,
            max_length=self.config['dataset']['max_seq_length'],
            return_tensors='np',
            return_token_type_ids=True,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            return_special_tokens_mask=True,
        )
    
        # during token classification, we want to ignore slots from subtokens and special tokens 
        subtoken_mask = find_subtokens(encodings)
        ignore_mask = subtoken_mask | encodings['special_tokens_mask']
        
        # retrieve the inputs from the encoded tokens
        inputs = {}
        
        for input in self.model.inputs:
            if input.name not in encodings:
                raise ValueError(f"the encoded inputs from the tokenizer doesn't contain '{input.name}'")

            inputs[input.name] = encodings[input.name]
                    
        # run the model
        logits = self.model.execute(inputs)
        logits = normalize_logits(logits)
        
        preds = np.argmax(logits, axis=-1)
        probs = np.amax(logits, axis=-1)
        
        # tabulate results
        tags = []
        label_map = {v: k for k, v in self.config['label_ids'].items()}
        num_queries, num_tokens, _ = logits.shape
        
        assert num_queries == 1  # there should only be 1 input query currently
        
        for query_idx in range(num_queries):
            query_tags = []
            
            for token_idx in range(num_tokens):
                label = label_map[preds[query_idx][token_idx]]
                
                # ignore unclassified slots or masked tokens
                if label == self.config['dataset']['pad_label'] or ignore_mask[query_idx][token_idx]:
                    continue

                # convert from token index back to the query string
                chars = encodings.token_to_chars(query_idx, token_idx)
                
                # append subtokens from the query to the text
                for subtoken_idx in range(token_idx+1, num_tokens):
                    if subtoken_mask[query_idx][subtoken_idx]:
                        chars = (chars[0], encodings.token_to_chars(query_idx, subtoken_idx)[1])
                    else:
                        break

                text = query[chars[0]:chars[1]] # queries[query_idx]

                # strip out punctuation to attach the entity tag to the word not to a punctuation mark
                if not text[-1].isalpha():
                    text = text[:-1]
                    chars = (chars[0], chars[1]-1)
                        
                query_tags.append({
                    'label' : label,
                    'class' : preds[query_idx][token_idx],
                    'score' : probs[query_idx][token_idx],
                    'text' : text,
                    'start' : chars[0],
                    'end' : chars[1]
                })
                
            tags.append(query_tags)
            
        if len(tags) == 1:
            return tags[0]
        else:
            return tags
        

================================================
FILE: jetson_voice/models/tts/__init__.py
================================================
#!/usr/bin/env python3
# coding: utf-8

from .tts_engine import TTSEngine


================================================
FILE: jetson_voice/models/tts/tts_engine.py
================================================
#!/usr/bin/env python3
# coding: utf-8

import os
import re
import logging
import inflect

import numpy as np

from jetson_voice.tts import TTSService
from jetson_voice.utils import global_config, load_model, softmax

      
class TTSEngine(TTSService):
    """
    Text-to-speech synthesis.  This is actually a pipeline of two models,
    the generator model (which generates MEL spectrograms from tokens),
    and the vocoder (which outputs audio from MEL spectrograms)
    """
    def __init__(self, config, *args, **kwargs):
        """
        Loads a streaming ASR model from ONNX or serialized TensorRT engine.
        
        Parameters:
          model (string) -- path to ONNX model or serialized TensorRT engine/plan
          config (string) -- path to model configuration json (will be inferred from model path if empty)
        """
        super(TTSEngine, self).__init__(config, *args, **kwargs)

        if self.config.type != 'tts':
            raise ValueError(f"{self.config.model_path} isn't a Text-to-Speech model (type '{self.config.type}'")
            
        # load text->MEL generator model
        self.generator = load_model(self.config.generator)
        
        # load MEL->audio vocoder model
        features = self.config.vocoder.features
        
        dynamic_shapes = {
            'min' : (1, features, 1),
            'opt' : (1, features, 160), # ~5-6 words
            'max' : (1, features, 1024) # ~20-30 words?
        }
        
        self.vocoder = load_model(self.config.vocoder, dynamic_shapes=dynamic_shapes)
        
        # create map of symbol->ID embeddings
        self.symbol_to_id = {s: i for i, s in enumerate(self.get_symbols())}
        
        # create operators for num-to-word conversion
        self.number_regex = re.compile(r'\d+(?:,\d+)?')  # https://stackoverflow.com/a/16321189
        self.number_inflect = inflect.engine()
        
    def __call__(self, text):
        """
        Generate audio from text.
        
        Parameters:
          text (string) -- The phrase to convert to audio.

        Returns audio samples in a numpy array.
        """
        text = self.numbers_to_words(text)   # vocab doesn't include numbers, so convert them to words
        
        pad_symbol = ' '
        min_length = 6
        
        if text[-1].isalnum():      # end with punctuation, otherwise audio is cut-off
            text += pad_symbol
          
        if len(text) < min_length:  # WAR for cuDNN error on JetPack <= 4.5.x
            text = text.ljust(min_length, pad_symbol)
            
        # convert chars to symbol embeddings
        encoded_text = [self.symbol_to_id[s] for s in text.lower() if s in self.symbol_to_id]
        encoded_text = np.expand_dims(np.array(encoded_text, dtype=np.int64), axis=0)
        
        # generate MEL spectrogram + audio
        mels = self.generator.execute(encoded_text)[0]
        audio = self.vocoder.execute(mels)

        return audio.squeeze()
     
    def get_symbols(self):
        """
        Return a list of all the accepted character symbols / embeddings
        """
        _arpabet = [
          'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2',
          'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2',
          'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY',
          'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1',
          'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0',
          'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW',
          'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH'
        ]
        _arpabet = ['@' + s for s in _arpabet]
        _pad = '_'
        _punctuation = '!\'(),.:;? '
        _special = '-'
        _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
        symbols = list(_pad + _special + _punctuation + _letters) + _arpabet
        return symbols
     
    def numbers_to_words(self, text):
        """
        Convert instances of numbers to words in the text.
        For example:  "The answer is 42" -> "The answer is forty two."
        """
        number_tokens = self.number_regex.findall(text)
        
        for number_token in number_tokens:
            # TODO test/handle floating-point numbers
            word_text = self.number_inflect.number_to_words(number_token)              
            num_begin = text.index(number_token)

            # insert the words back at the old location
            text = text[:num_begin] + word_text + text[num_begin + len(number_token):]
            
        return text
        
    @property
    def sample_rate(self):
        """
        Get the output sample rate (e.g. 22050, 44100, ect)
        """
        return self.config['vocoder']['sample_rate']

================================================
FILE: jetson_voice/nlp.py
================================================
#!/usr/bin/env python3
# coding: utf-8

from jetson_voice.utils import load_resource


def NLP(resource, *args, **kwargs):
    """
    Factory for automatically loading NLP models or services.
    
    Returns an instance of:
        - IntentSlotService
        - QuestionAnswerService
        - TextClassificationService
        - TokenClassificationService
    """
    from jetson_voice.auto import AutoModel
    return AutoModel(resource, domain='nlp', *args, **kwargs)
    
    
def IntentSlot(resource, *args, **kwargs):
    """
    Loads a NLP joint intent/slot classifier service or model.
    See the IntentSlotService class for the signature that implementations use.
    """
    factory_map = {
        'tensorrt' : 'jetson_voice.models.nlp.IntentSlotEngine',
        'onnxruntime' : 'jetson_voice.models.nlp.IntentSlotEngine'
    }
    
    return load_resource(resource, factory_map, *args, **kwargs)

    
class IntentSlotService():
    """
    Intent/slot classifier service base class.
    """
    def __init__(self, config, *args, **kwargs):
        """
        Create service instance.
        """
        self.config = config
        
    def __call__(self, query):
        """
        Perform intent/slot classification on the input query.
        
        Parameters:
          query (string) -- The text query, for example:
                             'What is the weather in San Francisco tomorrow?'

        Returns a dict with the following keys:
             'intent' (string) -- the classified intent label
             'score' (float) -- the intent probability [0,1]
             'slots' (list[dict]) -- a list of dicts, where each dict has the following keys:
                  'slot' (string) -- the slot label
                  'text' (string) -- the slot text from the query
                  'score' (float) -- the slot probability [0,1]
        """
        pass

 
def QuestionAnswer(resource, *args, **kwargs):
    """
    Loads a NLP question answering service or model.
    See the QuestionAnswerService class for the signature that implementations use.
    """
    factory_map = {
        'tensorrt' : 'jetson_voice.models.nlp.QuestionAnswerEngine',
        'onnxruntime' : 'jetson_voice.models.nlp.QuestionAnswerEngine'
    }
    
    return load_resource(resource, factory_map, *args, **kwargs) 
        
   
class QuestionAnswerService():
    """
    Question answering service base class.
    """
    def __init__(self, config, *args, **kwargs):
        """
        Create service instance.
        """
        self.config = config
        
    def __call__(self, query, top_k=1):
        """
        Perform question/answering on the input query.
        
        Parameters:
          query (dict or tuple) -- Either a dict with 'question' and 'context' keys,
                                   or a (question, context) tuple.
          top_k (int) -- How many of the top results to return, sorted by score.
                         The default (topk=1) is to return just the top result.
                         If topk > 1, then a list of results will be returned.
          
        Returns:
          dict(s) with the following keys:
          
             'answer' (string) -- the answer text
             'score' (float) -- the probability [0,1]
             'start' (int) -- the starting character index of the answer into the context text
             'end' (int) -- the ending character index of the answer into the context text
             
          If top_k > 1, a list of dicts with the topk results will be returned.
          If top_k == 1, just the single dict with the top score will be returned.
        """
        pass
        

def TextClassification(resource, *args, **kwargs):
    """
    Loads a NLP text classification service or model.
    See the TextClassificationService class for the signature that implementations use.
    """
    factory_map = {
        'tensorrt' : 'jetson_voice.models.nlp.TextClassificationEngine',
        'onnxruntime' : 'jetson_voice.models.nlp.TextClassificationEngine'
    }
    
    return load_resource(resource, factory_map, *args, **kwargs) 
        
   
class TextClassificationService():
    """
    Text classification service base class.
    """
    def __init__(self, config, *args, **kwargs):
        """
        Create service instance.
        """
        self.config = config
        
    def __call__(self, query):
        """
        Perform text classification on the input query.
        
        Parameters:
          query (string) -- The text query, for example:
                             'Today was warm, sunny and beautiful out.'

        Returns a dict with the following keys:
             'class' (int) -- the predicted class index
             'label' (string) -- the predicted class label (and if there aren't labels `str(class)`)
             'score' (float) -- the classification probability [0,1]
        """
        pass


def TokenClassification(resource, *args, **kwargs):
    """
    Loads a NLP token classification (aka Named Entity Recognition) service or model.
    See the TokenClassificationService class for the signature that implementations use.
    """
    factory_map = {
        'tensorrt' : 'jetson_voice.models.nlp.TokenClassificationEngine',
        'onnxruntime' : 'jetson_voice.models.nlp.TokenClassificationEngine'
    }
    
    return load_resource(resource, factory_map, *args, **kwargs) 
        
   
class TokenClassificationService():
    """
    Token classification (aka Named Entity Recognition) service base class.
    """
    def __init__(self, config, *args, **kwargs):
        """
        Create service instance.
        """
        self.config = config
        
    def __call__(self, query):
        """
        Perform token classification (NER) on the input query and return tagged entities.
        
        Parameters:
          query (string) -- The text query, for example:
                             "Ben is from Chicago, a city in the state of Illinois, US'

        Returns a list[dict] of tagged entities with the following dictionary keys:
             'class' (int) -- the entity class index
             'label' (string) -- the entity class label
             'score' (float) -- the classification probability [0,1]
             'text'  (string) -- the corresponding text from the input query
             'start' (int) -- the starting character index of the text
             'end'   (int) -- the ending character index of the text
        """
        pass

    @staticmethod
    def tag_string(query, tags, scores=False):
        """
        Returns a string with the tags inserted inline with the query.  For example:
        
        "Ben[B-PER] is from Chicago[B-LOC], a city in the state of Illinois[B-LOC], US[B-LOC]"
        
        Parameters:
          query  (string) -- The original query string.
          tags   (list[dict]) -- The tags predicted by the model.
          scores (bool) -- If true, the probabilities will be added inline.
                           If false (default), only the tag labels will be added.
        """
        char_offset = 0

        for tag in tags:
            if scores:
                tag_str = f"[{tag['label']} {tag['score']:.3}]"
            else:
                tag_str = f"[{tag['label']}]"
                
            query = query[:tag['end'] + char_offset] + tag_str + query[tag['end'] + char_offset:]
            char_offset += len(tag_str)
            
        return query
        
        
if __name__ == "__main__":

    from jetson_voice import ConfigArgParser
    import pprint
    
    parser = ConfigArgParser()
    
    parser.add_argument('--model', default='distilbert_intent', type=str)
    parser.add_argument('--type', default='intent_slot', type=str)

    args = parser.parse_args()
    args.type = args.type.lower()
    
    print(args)
    
    if args.type == 'intent_slot':
    
        model = IntentSlot(args.model)
        
        # create some test queries
        queries = [
            'Set alarm for Seven Thirty AM',
            'Please increase the volume',
            'What is my schedule for tomorrow',
            'Place an order for a large pepperoni pizza from Dominos'
        ]

        # process the queries
        for query in queries:
            results = model(query)
            
            print('\n')
            print('query:', query)
            print('')
            pprint.pprint(results)
     
    elif args.type == 'question_answer' or args.type == 'qa':

        model = QuestionAnswer(args.model)
        
        # create some test queries
        queries = []
        
        queries.append({
            "question" : "What is the value of Pi?",
            "context" : "Some people have said that Pi is tasty but there should be a value for Pi, and the value for Pi is around 3.14. "
                        "Pi is the ratio of a circle's circumference to it's diameter. The constant Pi was first calculated by Archimedes "
                        "in ancient Greece around the year 250 BC."
        })
        
        queries.append({
            "question" : "Who discovered Pi?",
            "context" : queries[-1]['context']
        })

        queries.append({
            "question" : "Which nation contains the majority of the Amazon forest?",
            "context" : "The Amazon rainforest is a moist broadleaf forest that covers most of the Amazon basin of South America. "
                        "This basin encompasses 7,000,000 square kilometres (2,700,000 sq mi), of which 5,500,000 square kilometres "
                        "(2,100,000 sq mi) are covered by the rainforest. The majority of the forest is contained within Brazil, "
                        "with 60% of the rainforest, followed by Peru with 13%, and Colombia with 10%."
        })
        
        queries.append({
            "question" : "How large is the Amazon rainforest?",
            "context" : queries[-1]['context']
        })
        
        # process the queries
        for query in queries:
            answers = model(query, top_k=5)
            
            print('\n')
            print('context:', query['context'])
            print('')
            print('question:', query['question'])
            
            for answer in answers:
                print('')
                print('answer:  ', answer['answer'])
                print('score:   ', answer['score'])
    
    elif args.type == 'text_classification':
    
        model = TextClassification(args.model)
        
        # create some test queries (these are for sentiment models)
        queries = [
            "By the end of no such thing the audience, like beatrice, has a watchful affection for the monster.",
            "Director Rob Marshall went out gunning to make a great one.",
            "Uneasy mishmash of styles and genres.",
            "I love exotic science fiction / fantasy movies but this one was very unpleasant to watch. I gave it 4 / 10 since some special effects were nice.",
            "Today was cold and rainy and not very nice.",
            "Today was warm, sunny and beautiful out.",
        ]

        # process the queries
        for query in queries:
            results = model(query)
            print('\nquery:', query)
            pprint.pprint(results)
    
    elif args.type == 'token_classification':
    
        model = TokenClassification(args.model)
    
        # create some test queries
        queries = [
            "But candidate Charles Baker, who has about eight percent of the vote, has called for an investigation into reports of people voting multiple times.",
            "Analysts say Mr. Chung's comments may be part of efforts by South Korea to encourage North Korea to resume bilateral talks.",
            "The 63-year-old Daltrey walked offstage during the first song; guitarist Pete Townshend later told the crowd he was suffering from bronchitis and could barely speak.",
            "The Who is currently touring in support of Endless Wire, its first album since 1982.",
            "Meanwhile, Iowa is cleaning up after widespread flooding inundated homes, destroyed crops and cut off highways and bridges.",
            "At the White House Tuesday, U.S. President George Bush expressed concern for the flood victims.",
            "Ben is from Chicago, a city in the state of Illinois, US with a population of 2.7 million people.",
            "Lisa's favorite place to climb in the summer is El Capitan in Yosemite National Park in California, U.S."
        ]

        # process the queries
        for query in queries:
            tags = model(query)
            #print(f'\n{query}')
            #pprint.pprint(tags)
            print(f'\n{model.tag_string(query, tags, scores=True)}')
        
    else: 
        raise ValueError(f"invalid --type argument ({args.type})")
        

================================================
FILE: jetson_voice/tts.py
================================================
#!/usr/bin/env python3
# coding: utf-8

from jetson_voice.utils import load_resource


def TTS(resource, *args, **kwargs):
    """
    Loads a TTS service or model.
    See the TTSService class for the signature that implementations use.
    """
    factory_map = {
        'riva' : 'jetson_voice.backends.riva.RivaTTSService',
        'tensorrt' : 'jetson_voice.models.tts.TTSEngine',
        'onnxruntime' : 'jetson_voice.models.tts.TTSEngine'
    }
    
    return load_resource(resource, factory_map, *args, **kwargs)

    
class TTSService():
    """
    TTS service base class.
    """
    def __init__(self, config, *args, **kwargs):
        """
        Create service instance.
        """
        self.config = config
        
    def __call__(self, text):
        """
        Generate audio from text.
        
        Parameters:
          text (string) -- The phrase to convert to audio.

        Returns audio samples in a numpy array.
        """
        pass
    
    @property
    def sample_rate(self):
        """
        Get the output sample rate (in Hz)
        """
        pass
        
        
if __name__ == "__main__":

    from jetson_voice import list_audio_devices, ConfigArgParser
    from soundfile import SoundFile
    
    import pprint
    import pyaudio
    import time
    
    parser = ConfigArgParser()
    
    parser.add_argument('--model', default='fastpitch_hifigan', type=str)
    parser.add_argument('--text', default='Hello, how are you today?', type=str)
    parser.add_argument('--warmup', type=int, default=9, help='the number of warmup runs')
    parser.add_argument("--output-device", type=int, default=None, help='output audio device to use')
    parser.add_argument("--output-wav", type=str, default=None, help='output wav file to write to')
    parser.add_argument('--list-devices', action='store_true', help='list audio input devices')
    
    args = parser.parse_args()
    print(args)
    
    # list audio devices
    if args.list_devices:
        list_audio_devices()
        
    # load the model
    tts = TTS(args.model)
    
     # display the text
    print(f"\n'{args.text}'\n")
    
    # run the TTS
    for run in range(args.warmup+1):
        start = time.perf_counter()
        audio = tts(args.text)
        stop = time.perf_counter()
        latency = stop-start
        duration = audio.shape[0]/tts.sample_rate
        print(f"Run {run} -- Time to first audio: {latency:.3f}s. Generated {duration:.2f}s of audio. RTFx={duration/latency:.2f}.")
        
    # output the audio
    if args.output_device is not None:
        p = pyaudio.PyAudio()
        stream = p.open(output_device_index=args.output_device, 
                        format=pyaudio.paFloat32, 
                        channels=1, rate=tts.sample_rate, output=True)
        stream.write(audio.tobytes())
        stream.close_stream()
        stream.close()
        
    if args.output_wav is not None:
        wav = SoundFile(args.output_wav, mode='w', samplerate=tts.sample_rate, channels=1)
        wav.write(audio)
        wav.close()
        print(f"Wrote audio to {args.output_wav}")
    

================================================
FILE: jetson_voice/utils/__init__.py
================================================
#!/usr/bin/env python3
# coding: utf-8

from .config import global_config, ConfigDict, ConfigArgParser
from .resource import find_resource, load_resource, load_model, list_models

from .audio import *
from .softmax import softmax, normalize_logits

================================================
FILE: jetson_voice/utils/audio.py
================================================
#!/usr/bin/env python3
# coding: utf-8

import os
import math
import pprint
import logging
import librosa
import soundfile

import pyaudio as pa
import numpy as np


def audio_db(samples):
    """
    Compute RMS of audio samples in dB.
    """
    rms = librosa.feature.rms(y=samples, frame_length=samples.shape[0], center=False)[0][0]

    if rms != 0.0:
        return 20.0 * math.log10(rms)
    else:
        return -100.0
        
        
def audio_to_float(samples):
    """
    Convert audio samples to 32-bit float in the range [-1,1]
    """
    if samples.dtype == np.float32:
        return samples
        
    return samples.astype(np.float32) / 32768
  

def audio_to_int16(samples):
    """
    Convert audio samples to 16-bit float in the range [-32767,32767]
    """
    if samples.dtype == np.int16:
        return samples
    elif samples.dtype == np.float32:
        return (samples * 32768).astype(np.int16)
    else:
        return samples.astype(np.int16)
        
    
def AudioInput(wav=None, mic=None, sample_rate=16000, chunk_size=16000):
    """
    Create an audio input stream from wav file or microphone.
    Either the wav or mic argument needs to be specified.
    
    Parameters:
        wav (string) -- path to .wav file
        mic (int) -- microphone device index
        sample_rate (int) -- the desired sample rate in Hz
        chunk_size (int) -- the number of samples returned per next() iteration
        
    Returns AudioWavStream or AudioMicStream
    """
    if mic is not None and mic != '':
        return AudioMicStream(mic, sample_rate=sample_rate, chunk_size=chunk_size)
    elif wav is not None and wav != '':
        return AudioWavStream(wav, sample_rate=sample_rate, chunk_size=chunk_size)
    else:
        raise ValueError('either wav or mic argument must be specified')
 
 
class AudioWavStream:
    """
    Audio playback stream from .wav file
    """
    def __init__(self, filename, sample_rate, chunk_size):
        self.filename = filename
        self.chunk_size = chunk_size
        self.sample_rate = sample_rate
                
        if not os.path.isfile(filename):
            raise IOError(f'could not find file {filename}')
            
        logging.info(f"loading audio '{filename}'")
        
        self.samples, _ = librosa.load(filename, sr=sample_rate, mono=True)
        self.position = 0

    def open(self):
        pass
        
    def close(self):
        pass
        
    def reset(self):
        self.position = 0
        
    def next(self):
        if self.position >= len(self.samples):
            return None
        
        chunk = self.samples[self.position : min(self.position + self.chunk_size, len(self.samples))]
        
        if len(chunk) < self.chunk_size:
            chunk = np.pad(chunk, (0, self.chunk_size-len(chunk)), mode='constant')
            
        self.position += self.chunk_size
        return chunk
        
    def __next__(self):
        samples = self.next()
        
        if samples is None:
            raise StopIteration
        else:
            return samples
        
    def __iter__(self):
        self.position = 0
        return self


class AudioMicStream:
    """
    Live audio stream from microphone input device.
    """
    def __init__(self, device, sample_rate, chunk_size):
        self.stream = None
        self.interface = pa.PyAudio()
        
        self.device_info = find_audio_device(device, self.interface)
        self.device_id = self.device_info['index']
        self.device_sample_rate = sample_rate
        self.device_chunk_size = chunk_size
        
        self.sample_rate = sample_rate
        self.chunk_size = chunk_size
        
        print('Audio Input Device:')
        pprint.pprint(self.device_info)
    
    def __del__(self):
        self.close()
        self.interface.terminate()
        
    def open(self):
        if self.stream:
            return
        
        sample_rates = [self.sample_rate, int(self.device_info['defaultSampleRate']), 16000, 22050, 32000, 44100]
        chunk_sizes = []
        
        for sample_rate in sample_rates:
            chunk_sizes.append(int(self.chunk_size * sample_rate / self.sample_rate))
            
        for sample_rate, chunk_size in zip(sample_rates, chunk_sizes):
            try:    
                logging.info(f'trying to open audio input {self.device_id} with sample_rate={sample_rate} chunk_size={chunk_size}')
                
                self.stream = self.interface.open(format=pa.paInt16,
                                channels=1,
                                rate=sample_rate,
                                input=True,
                                input_device_index=self.device_id,
                                frames_per_buffer=chunk_size)
                                
                self.device_sample_rate = sample_rate
                self.device_chunk_size = chunk_size
                
                break
                
            except OSError as err:
                print(err)
                logging.warning(f'failed to open audio input {self.device_id} with sample_rate={sample_rate}')
                self.stream = None
                
        if self.stream is None:
            logging.error(f'failed to open audio input device {self.device_id} with any of these sample rates:')
            logging.error(str(sample_rates))
            raise ValueError(f"audio input device {self.device_id} couldn't be opened or does not support any of the above sample rates")
                      
        print(f"\naudio stream opened on device {self.device_id} ({self.device_info['name']})")
        print("you can begin speaking now... (press Ctrl+C to exit)\n")
            
    def close(self):
        if self.stream is not None:
            self.stream.stop_stream()
            self.stream.close()
            self.stream = None
     
    def reset(self):
        self.close()
        self.open()
        
    def next(self):
        self.open()
            
        samples = self.stream.read(self.device_chunk_size, exception_on_overflow=False)
        samples = np.frombuffer(samples, dtype=np.int16)
        
        if self.sample_rate != self.device_sample_rate:
            samples = audio_to_float(samples)
            samples = librosa.resample(samples, self.device_sample_rate, self.sample_rate)
            
            if len(samples) != self.chunk_size:
                logging.warning(f'resampled input audio has {len(samples)}, but expected {self.chunk_size} samples')
                
        return samples
        
    def __next__(self):
        samples = self.next()
        
        if samples is None:
            raise StopIteration
        else:
            return samples
        
    def __iter__(self):
        self.open()
        return self
        

class AudioOutput:
    """
    Audio output stream to a speaker.
    """
    def __init__(self, device, sample_rate, chunk_size=4096):
        self.stream = None
        
        if device is None:
            self.device_id = None
            logging.warning(f"creating pass-through audio output without a device")
            return
            
        self.interface = pa.PyAudio()
        self.device_info = find_audio_device(device, self.interface)
        self.device_id = self.device_info['index']
        self.chunk_size = chunk_size
        self.sample_rate = sample_rate
        self.requested_rate = sample_rate
        
        print('Audio Output Device:')
        pprint.pprint(self.device_info)
        
        self.open()
    
    def __del__(self):
        if self.device_id is None:
            return
            
        self.close()
        self.interface.terminate()
        
    def open(self):
        if self.stream or self.device_id is None:
            return
            
        try:
            self.stream = self.interface.open(format=pa.paFloat32,
                            channels=1, rate=self.sample_rate,
                            frames_per_buffer=self.chunk_size,
                            output=True, output_device_index=self.device_id)
        except:
            self.sample_rate = int(self.device_info['defaultSampleRate'])
            logging.error(f"failed to open audio output device with sample_rate={self.requested_rate}, trying again with sample_rate={self.sample_rate}")
            
            self.stream = self.interface.open(format=pa.paFloat32,
                            channels=1, rate=self.sample_rate,
                            frames_per_buffer=self.chunk_size,
                            output=True, output_device_index=self.device_id)
        
        logging.info(f"opened audio output device {self.device_id} ({self.device_info['name']})")
        
    def close(self):
        if self.stream is not None:
            self.stream.stop_stream()
            self.stream.close()
            self.stream = None
       
    def write(self, samples):
        if self.device_id is None:
            return
            
        self.open()
        samples = audio_to_float(samples)
        
        if self.requested_rate != self.sample_rate:
            samples = librosa.resample(samples, self.requested_rate, self.sample_rate)
            #wav = soundfile.SoundFile('data/audio/resample_test.wav', mode='w', samplerate=self.sample_rate, channels=1)
            #wav.write(samples)
            #wav.close()
            
        self.stream.write(samples.tobytes())
        
        
#
# device enumeration
# 
_audio_device_info = None

def _get_audio_devices(audio_interface=None):
    global _audio_device_info
    
    if _audio_device_info:
        return _audio_device_info
        
    if audio_interface:
        interface = audio_interface
    else:
        interface = pa.PyAudio()
        
    info = interface.get_host_api_info_by_index(0)
    numDevices = info.get('deviceCount')
    
    _audio_device_info = []
    
    for i in range(0, numDevices):
        _audio_device_info.append(interface.get_device_info_by_host_api_device_index(0, i))
    
    if not audio_interface:
        interface.terminate()
        
    return _audio_device_info
     
     
def find_audio_device(device, audio_interface=None):
    """
    Find an audio device by it's name or ID number.
    """
    devices = _get_audio_devices(audio_interface)
    
    try:
        device_id = int(device)
    except ValueError:
        if not isinstance(device, str):
            raise ValueError("expected either a string or an int for 'device' parameter")
            
        found = False
        
        for id, dev in enumerate(devices):
            if device.lower() == dev['name'].lower():
                device_id = id
                found = True
                break
                
        if not found:
            raise ValueError(f"could not find audio device with name '{device}'")
            
    if device_id < 0 or device_id >= len(devices):
        raise ValueError(f"invalid audio device ID ({device_id})")
        
    return devices[device_id]
                
   
def list_audio_inputs():
    """
    Print out information about present audio input devices.
    """
    devices = _get_audio_devices()

    print('')
    print('----------------------------------------------------')
    print(f" Audio Input Devices")
    print('----------------------------------------------------')
        
    for i, dev_info in enumerate(devices):    
        if (dev_info.get('maxInputChannels')) > 0:
            print("Input Device ID {:d} - '{:s}' (inputs={:.0f}) (sample_rate={:.0f})".format(i,
                  dev_info.get('name'), dev_info.get('maxInputChannels'), dev_info.get('defaultSampleRate')))
                 
    print('')
    
    
def list_audio_outputs():
    """
    Print out information about present audio output devices.
    """
    devices = _get_audio_devices()
    
    print('')
    print('----------------------------------------------------')
    print(f" Audio Output Devices")
    print('----------------------------------------------------')
        
    for i, dev_info in enumerate(devices):  
        if (dev_info.get('maxOutputChannels')) > 0:
            print("Output Device ID {:d} - '{:s}' (outputs={:.0f}) (sample_rate={:.0f})".format(i,
                  dev_info.get('name'), dev_info.get('maxOutputChannels'), dev_info.get('defaultSampleRate')))
                  
    print('')
    
    
def list_audio_devices():
    """
    Print out information about present audio input and output devices.
    """
    list_audio_inputs()
    list_audio_outputs()

              

              

================================================
FILE: jetson_voice/utils/config.py
================================================
#!/usr/bin/env python3
# coding: utf-8

import os
import json
import pprint
import logging
import argparse


#
# Default global configuration
#
# This can be overriden at runtime with command-line options (see ConfigArgParser)
# such as --global-config to load your own configuration from json file,
# or by calling config.load('my_config.json')
#
# You can also set the options directly on the 'config' object, e.g.
#
#    config.model_dir = '/path/to/my/models'
#    config.log_level = 'warning'
#
# It's recommended to use one of the methods above instead of changing _default_config directly.
#
_default_global_config = {
    'version' : 0.1,
    'model_dir' : '/jetson-voice/data/networks',
    'model_manifest' : '/jetson-voice/data/networks/manifest.json',
    'default_backend' : 'tensorrt',
    'log_level' : 'info',
    'debug' : False,
    'profile' : False
}


class ConfigDict(dict):
    """
    Configuration dict that can be loaded from JSON and has members
    accessible via attributes and can watch for updates to keys.
    """
    def __init__(self, *args, path=None, watch=None, **kwargs):
        """
        Parameters:
          path (str) -- Path to JSON file to load from
          
          watch (function or dict) -- A callback function that gets called when a key is set.
                                      Should a function signature like my_watch(key, value)
                                      This can also be a dict of key names and functions,
                                      and each function will only be called when it's particular
                                      key has been set.  You can also subclass ConfigDict and
                                      overrid

Download .txt

gitextract_8xzz9c2n/

├── .dockerignore
├── .gitignore
├── .gitmodules
├── Dockerfile.aarch64
├── Dockerfile.ros
├── Dockerfile.runtime
├── Dockerfile.x86_64
├── README.md
├── docker/
│   ├── build.sh
│   ├── push.sh
│   ├── run.sh
│   └── tag.sh
├── examples/
│   ├── asr.py
│   ├── assistant.py
│   ├── nlp.py
│   ├── nlp_qa.py
│   └── tts.py
├── jetson_voice/
│   ├── __init__.py
│   ├── asr.py
│   ├── auto.py
│   ├── backends/
│   │   ├── onnxruntime/
│   │   │   ├── __init__.py
│   │   │   └── ort_model.py
│   │   ├── riva/
│   │   │   ├── __init__.py
│   │   │   ├── riva_asr.py
│   │   │   └── riva_tts.py
│   │   └── tensorrt/
│   │       ├── __init__.py
│   │       ├── trt_binding.py
│   │       ├── trt_builder.py
│   │       └── trt_model.py
│   ├── models/
│   │   ├── __init__.py
│   │   ├── asr/
│   │   │   ├── __init__.py
│   │   │   ├── asr_engine.py
│   │   │   ├── ctc_beamsearch.py
│   │   │   ├── ctc_decoder.py
│   │   │   ├── ctc_greedy.py
│   │   │   └── ctc_utils.py
│   │   ├── nlp/
│   │   │   ├── __init__.py
│   │   │   ├── intent_slot.py
│   │   │   ├── nlp_utils.py
│   │   │   ├── question_answer.py
│   │   │   ├── text_classification.py
│   │   │   └── token_classification.py
│   │   └── tts/
│   │       ├── __init__.py
│   │       └── tts_engine.py
│   ├── nlp.py
│   ├── tts.py
│   └── utils/
│       ├── __init__.py
│       ├── audio.py
│       ├── config.py
│       ├── resource.py
│       └── softmax.py
├── patches/
│   ├── nemo/
│   │   ├── 1.0.0rc1/
│   │   │   ├── exportable.original.py
│   │   │   ├── exportable.py
│   │   │   ├── nlp/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── distilbert.diff
│   │   │   │   ├── distilbert.original.py
│   │   │   │   ├── distilbert.py
│   │   │   │   ├── huggingface_utils.py
│   │   │   │   ├── location.txt
│   │   │   │   └── mobilebert.py
│   │   │   ├── setup.original.py
│   │   │   └── setup.py
│   │   └── 1.6.2/
│   │       ├── requirements.original.txt
│   │       ├── requirements.txt
│   │       ├── requirements_nlp.original.txt
│   │       └── requirements_nlp.txt
│   ├── pytorch/
│   │   ├── 1.6.0/
│   │   │   ├── functional.diff
│   │   │   ├── functional.original.py
│   │   │   └── functional.py
│   │   └── 1.7.0/
│   │       ├── functional.diff
│   │       ├── functional.original.py
│   │       └── functional.py
│   └── transformers/
│       ├── 4.5.0/
│       │   ├── convert_graph_to_onnx.diff
│       │   ├── convert_graph_to_onnx.original.py
│       │   ├── convert_graph_to_onnx.py
│       │   └── modeling_distilbert.py
│       └── 4.5.1/
│           ├── convert_graph_to_onnx.diff
│           ├── convert_graph_to_onnx.original.py
│           ├── convert_graph_to_onnx.py
│           ├── modeling_distilbert.diff
│           ├── modeling_distilbert.original.py
│           └── modeling_distilbert.py
├── ros/
│   ├── CMakeLists.txt
│   ├── jetson_voice_ros/
│   │   ├── __init__.py
│   │   ├── asr.py
│   │   ├── audio_input.py
│   │   ├── audio_output.py
│   │   ├── nlp_intent_slot.py
│   │   ├── nlp_question_answer.py
│   │   └── tts.py
│   ├── launch/
│   │   ├── asr.launch.py
│   │   ├── audio_playback.launch.py
│   │   └── tts.launch.py
│   ├── msg/
│   │   ├── Audio.msg
│   │   ├── AudioInfo.msg
│   │   ├── IntentSlot.msg
│   │   ├── QuestionAnswerQuery.msg
│   │   ├── QuestionAnswerResult.msg
│   │   └── Slot.msg
│   └── package.xml
├── scripts/
│   ├── list_audio_devices.py
│   ├── list_models.py
│   ├── nemo_export_onnx.py
│   ├── nemo_list_models.py
│   ├── nemo_train_classifier.py
│   ├── nemo_train_intent.py
│   ├── nemo_train_ner.py
│   ├── nemo_train_qa.py
│   ├── os_version.sh
│   ├── record_mic.py
│   └── start_jupyter.sh
└── tests/
    ├── run_tests.py
    ├── test_asr.py
    ├── test_nlp.py
    └── test_tts.py

Download .txt

SYMBOL INDEX (589 symbols across 57 files)

FILE: examples/assistant.py
  function get_slot (line 51) | def get_slot(results, name, default='', threshold=0, merge=True):
  function generate_response (line 77) | def generate_response(query):

FILE: examples/nlp_qa.py
  function print_context (line 42) | def print_context():
  function parse_commands (line 46) | def parse_commands(entry):

FILE: jetson_voice/asr.py
  function ASR (line 7) | def ASR(resource, *args, **kwargs):
  class ASRService (line 21) | class ASRService():
    method __init__ (line 25) | def __init__(self, config, *args, **kwargs):
    method __call__ (line 28) | def __call__(self, samples):
    method classification (line 49) | def classification(self):
    method sample_rate (line 57) | def sample_rate(self):
    method frame_length (line 65) | def frame_length(self):
    method chunk_size (line 72) | def chunk_size(self):

FILE: jetson_voice/auto.py
  function AutoModel (line 11) | def AutoModel(resource, domain=None, *args, **kwargs):

FILE: jetson_voice/backends/onnxruntime/ort_model.py
  class OnnxRuntimeModel (line 16) | class OnnxRuntimeModel:
    method __init__ (line 20) | def __init__(self, config, *args, **kwargs):
    method execute (line 40) | def execute(self, inputs, return_dict=False, **kwargs):

FILE: jetson_voice/backends/riva/riva_asr.py
  class RivaASRService (line 18) | class RivaASRService(ASRService):
    method __init__ (line 22) | def __init__(self, config, *args, **kwargs):
    method __call__ (line 67) | def __call__(self, samples):
    method __next__ (line 99) | def __next__(self):
    method recieve_responses (line 111) | def recieve_responses(self):
    method sample_rate (line 140) | def sample_rate(self):
    method frame_length (line 148) | def frame_length(self):
    method chunk_size (line 155) | def chunk_size(self):

FILE: jetson_voice/backends/riva/riva_tts.py
  class RivaTTSService (line 16) | class RivaTTSService(TTSService):
    method __init__ (line 20) | def __init__(self, config, *args, **kwargs):
    method __call__ (line 37) | def __call__(self, text):
    method sample_rate (line 60) | def sample_rate(self):

FILE: jetson_voice/backends/tensorrt/trt_binding.py
  class Binding (line 11) | class Binding:
    method __init__ (line 15) | def __init__(self, model, index):
    method alloc (line 43) | def alloc(self, shape=None):
    method set_shape (line 61) | def set_shape(self, shape):
    method query_shape (line 85) | def query_shape(self):
    method __str__ (line 109) | def __str__(self):

FILE: jetson_voice/backends/tensorrt/trt_builder.py
  function build_engine (line 15) | def build_engine(config,

FILE: jetson_voice/backends/tensorrt/trt_model.py
  class TRTModel (line 20) | class TRTModel:
    method __init__ (line 24) | def __init__(self, config, dynamic_shapes=None, *args, **kwargs):
    method execute (line 81) | def execute(self, inputs, sync=True, return_dict=False, **kwargs):
    method find_binding (line 154) | def find_binding(self, name):
    method set_shape (line 164) | def set_shape(self, binding, shape):

FILE: jetson_voice/models/asr/asr_engine.py
  class ASREngine (line 19) | class ASREngine(ASRService):
    method __init__ (line 24) | def __init__(self, config, *args, **kwargs):
    method __call__ (line 140) | def __call__(self, samples):
    method classification (line 199) | def classification(self):
    method sample_rate (line 206) | def sample_rate(self):
    method frame_length (line 214) | def frame_length(self):
    method frame_overlap (line 221) | def frame_overlap(self):
    method chunk_size (line 228) | def chunk_size(self):
  function torch_to_numpy (line 235) | def torch_to_numpy(tensor):

FILE: jetson_voice/models/asr/ctc_beamsearch.py
  class CTCBeamSearchDecoder (line 16) | class CTCBeamSearchDecoder(CTCDecoder):
    method __init__ (line 20) | def __init__(self, config, vocab, resource_path=None):
    method decode (line 61) | def decode(self, logits):
    method reset (line 143) | def reset(self):
    method language_model (line 152) | def language_model(self):

FILE: jetson_voice/models/asr/ctc_decoder.py
  class CTCDecoder (line 5) | class CTCDecoder:
    method from_config (line 10) | def from_config(config, vocab, resource_path=None):
    method __init__ (line 29) | def __init__(self, config, vocab):
    method decode (line 41) | def decode(self, logits):
    method reset (line 59) | def reset(self):
    method set_timestep (line 65) | def set_timestep(self, timestep):
    method set_timestep_delta (line 71) | def set_timestep_delta(self, offset):
    method set_timestep_duration (line 77) | def set_timestep_duration(self, duration):
    method type (line 85) | def type(self):

FILE: jetson_voice/models/asr/ctc_greedy.py
  class CTCGreedyDecoder (line 13) | class CTCGreedyDecoder(CTCDecoder):
    method __init__ (line 17) | def __init__(self, config, vocab):
    method decode (line 36) | def decode(self, logits):
    method merge_chars (line 103) | def merge_chars(self, text, probs):
    method reset (line 147) | def reset(self):

FILE: jetson_voice/models/asr/ctc_utils.py
  function transcript_from_words (line 10) | def transcript_from_words(words, scores=False, times=False, end=False, a...
  function find_overlapping_word (line 36) | def find_overlapping_word(wordlist, word):
  function find_word_after (line 47) | def find_word_after(wordlist, time):
  function find_word_before (line 62) | def find_word_before(wordlist, time):
  function merge_words (line 77) | def merge_words(wordlist, words, score_threshold=-np.inf, method='overla...
  function split_words (line 141) | def split_words(wordlist, times):
  function rebase_word_times (line 162) | def rebase_word_times(wordlist):
  function find_silent_intervals (line 179) | def find_silent_intervals(logits, blank_symbol_id, min_silent_time, time...

FILE: jetson_voice/models/nlp/intent_slot.py
  class IntentSlotEngine (line 15) | class IntentSlotEngine(IntentSlotService):
    method __init__ (line 19) | def __init__(self, config, *args, **kwargs):
    method __call__ (line 41) | def __call__(self, query):
    method intent_labels (line 133) | def intent_labels(self):
    method intent_label (line 139) | def intent_label(self, index):
    method slot_labels (line 146) | def slot_labels(self):
    method slot_label (line 152) | def slot_label(self, index):

FILE: jetson_voice/models/nlp/nlp_utils.py
  function find_subtokens (line 12) | def find_subtokens(encodings, method='char_span'):

FILE: jetson_voice/models/nlp/question_answer.py
  class QuestionAnswerEngine (line 15) | class QuestionAnswerEngine(QuestionAnswerService):
    method __init__ (line 19) | def __init__(self, config, *args, **kwargs):
    method __call__ (line 41) | def __call__(self, query, top_k=1):
    method decode (line 198) | def decode(self, start: np.ndarray, end: np.ndarray, top_k: int):

FILE: jetson_voice/models/nlp/text_classification.py
  class TextClassificationEngine (line 15) | class TextClassificationEngine(TextClassificationService):
    method __init__ (line 19) | def __init__(self, config, *args, **kwargs):
    method __call__ (line 40) | def __call__(self, query):

FILE: jetson_voice/models/nlp/token_classification.py
  class TokenClassificationEngine (line 15) | class TokenClassificationEngine(TokenClassificationService):
    method __init__ (line 19) | def __init__(self, config, *args, **kwargs):
    method __call__ (line 40) | def __call__(self, query):

FILE: jetson_voice/models/tts/tts_engine.py
  class TTSEngine (line 15) | class TTSEngine(TTSService):
    method __init__ (line 21) | def __init__(self, config, *args, **kwargs):
    method __call__ (line 55) | def __call__(self, text):
    method get_symbols (line 85) | def get_symbols(self):
    method numbers_to_words (line 106) | def numbers_to_words(self, text):
    method sample_rate (line 124) | def sample_rate(self):

FILE: jetson_voice/nlp.py
  function NLP (line 7) | def NLP(resource, *args, **kwargs):
  function IntentSlot (line 21) | def IntentSlot(resource, *args, **kwargs):
  class IntentSlotService (line 34) | class IntentSlotService():
    method __init__ (line 38) | def __init__(self, config, *args, **kwargs):
    method __call__ (line 44) | def __call__(self, query):
  function QuestionAnswer (line 63) | def QuestionAnswer(resource, *args, **kwargs):
  class QuestionAnswerService (line 76) | class QuestionAnswerService():
    method __init__ (line 80) | def __init__(self, config, *args, **kwargs):
    method __call__ (line 86) | def __call__(self, query, top_k=1):
  function TextClassification (line 111) | def TextClassification(resource, *args, **kwargs):
  class TextClassificationService (line 124) | class TextClassificationService():
    method __init__ (line 128) | def __init__(self, config, *args, **kwargs):
    method __call__ (line 134) | def __call__(self, query):
  function TokenClassification (line 150) | def TokenClassification(resource, *args, **kwargs):
  class TokenClassificationService (line 163) | class TokenClassificationService():
    method __init__ (line 167) | def __init__(self, config, *args, **kwargs):
    method __call__ (line 173) | def __call__(self, query):
    method tag_string (line 192) | def tag_string(query, tags, scores=False):

FILE: jetson_voice/tts.py
  function TTS (line 7) | def TTS(resource, *args, **kwargs):
  class TTSService (line 21) | class TTSService():
    method __init__ (line 25) | def __init__(self, config, *args, **kwargs):
    method __call__ (line 31) | def __call__(self, text):
    method sample_rate (line 43) | def sample_rate(self):

FILE: jetson_voice/utils/audio.py
  function audio_db (line 15) | def audio_db(samples):
  function audio_to_float (line 27) | def audio_to_float(samples):
  function audio_to_int16 (line 37) | def audio_to_int16(samples):
  function AudioInput (line 49) | def AudioInput(wav=None, mic=None, sample_rate=16000, chunk_size=16000):
  class AudioWavStream (line 70) | class AudioWavStream:
    method __init__ (line 74) | def __init__(self, filename, sample_rate, chunk_size):
    method open (line 87) | def open(self):
    method close (line 90) | def close(self):
    method reset (line 93) | def reset(self):
    method next (line 96) | def next(self):
    method __next__ (line 108) | def __next__(self):
    method __iter__ (line 116) | def __iter__(self):
  class AudioMicStream (line 121) | class AudioMicStream:
    method __init__ (line 125) | def __init__(self, device, sample_rate, chunk_size):
    method __del__ (line 140) | def __del__(self):
    method open (line 144) | def open(self):
    method close (line 183) | def close(self):
    method reset (line 189) | def reset(self):
    method next (line 193) | def next(self):
    method __next__ (line 208) | def __next__(self):
    method __iter__ (line 216) | def __iter__(self):
  class AudioOutput (line 221) | class AudioOutput:
    method __init__ (line 225) | def __init__(self, device, sample_rate, chunk_size=4096):
    method __del__ (line 245) | def __del__(self):
    method open (line 252) | def open(self):
    method close (line 272) | def close(self):
    method write (line 278) | def write(self, samples):
  function _get_audio_devices (line 299) | def _get_audio_devices(audio_interface=None):
  function find_audio_device (line 324) | def find_audio_device(device, audio_interface=None):
  function list_audio_inputs (line 353) | def list_audio_inputs():
  function list_audio_outputs (line 372) | def list_audio_outputs():
  function list_audio_devices (line 391) | def list_audio_devices():

FILE: jetson_voice/utils/config.py
  class ConfigDict (line 36) | class ConfigDict(dict):
    method __init__ (line 41) | def __init__(self, *args, path=None, watch=None, **kwargs):
    method load (line 70) | def load(self, path, clear=False):
    method __getattr__ (line 87) | def __getattr__(self, attr):
    method __setattr__ (line 93) | def __setattr__(self, attr, value):
    method __setitem__ (line 99) | def __setitem__(self, key, value):
    method __watch__ (line 107) | def __watch__(self, key, value):
    method __str__ (line 119) | def __str__(self):
    method setdefault (line 125) | def setdefault(self, key, default=None):
    method update (line 136) | def update(self, *args, **kwargs):
  function _set_log_level (line 148) | def _set_log_level(key, value):
  class ConfigArgParser (line 179) | class ConfigArgParser(argparse.ArgumentParser):
    method __init__ (line 183) | def __init__(self, *args, **kwargs):
    method parse_args (line 200) | def parse_args(self, *args, **kwargs):

FILE: jetson_voice/utils/resource.py
  function find_resource (line 17) | def find_resource(path):
  function load_resource (line 38) | def load_resource(resource, factory_map, *args, **kwargs):
  function load_model (line 88) | def load_model(config, dynamic_shapes=None):
  function load_models_manifest (line 120) | def load_models_manifest(path=None):
  function find_model_manifest (line 139) | def find_model_manifest(name):
  function download_model (line 162) | def download_model(name, max_attempts=10, retry_time=5):
  function get_model_config_path (line 220) | def get_model_config_path(name=None, manifest=None):
  function list_models (line 239) | def list_models():

FILE: jetson_voice/utils/softmax.py
  function softmax (line 7) | def softmax(x, theta=1.0, axis=None):
  function normalize_logits (line 51) | def normalize_logits(logits):

FILE: patches/nemo/1.0.0rc1/exportable.original.py
  class ExportFormat (line 39) | class ExportFormat(Enum):
  class Exportable (line 52) | class Exportable(ABC):
    method get_format (line 59) | def get_format(filename: str):
    method input_module (line 67) | def input_module(self):
    method output_module (line 71) | def output_module(self):
    method get_input_names (line 74) | def get_input_names(self, input_example):
    method get_output_names (line 88) | def get_output_names(self, output_example):
    method get_input_dynamic_axes (line 102) | def get_input_dynamic_axes(self, input_names):
    method get_output_dynamic_axes (line 111) | def get_output_dynamic_axes(self, output_names):
    method export (line 120) | def export(
    method disabled_deployment_input_names (line 256) | def disabled_deployment_input_names(self):
    method disabled_deployment_output_names (line 261) | def disabled_deployment_output_names(self):
    method supported_export_formats (line 266) | def supported_export_formats(self):
    method _extract_dynamic_axes (line 271) | def _extract_dynamic_axes(name: str, ntype: NeuralType):
    method _prepare_for_export (line 294) | def _prepare_for_export(self, replace_1D_2D=False):

FILE: patches/nemo/1.0.0rc1/exportable.py
  class ExportFormat (line 39) | class ExportFormat(Enum):
  class Exportable (line 52) | class Exportable(ABC):
    method get_format (line 59) | def get_format(filename: str):
    method input_module (line 67) | def input_module(self):
    method output_module (line 71) | def output_module(self):
    method get_input_names (line 74) | def get_input_names(self, input_example):
    method get_output_names (line 88) | def get_output_names(self, output_example):
    method get_input_dynamic_axes (line 102) | def get_input_dynamic_axes(self, input_names):
    method get_output_dynamic_axes (line 111) | def get_output_dynamic_axes(self, output_names):
    method export (line 120) | def export(
    method disabled_deployment_input_names (line 274) | def disabled_deployment_input_names(self):
    method disabled_deployment_output_names (line 279) | def disabled_deployment_output_names(self):
    method supported_export_formats (line 284) | def supported_export_formats(self):
    method _extract_dynamic_axes (line 289) | def _extract_dynamic_axes(name: str, ntype: NeuralType):
    method _prepare_for_export (line 312) | def _prepare_for_export(self, replace_1D_2D=False):

FILE: patches/nemo/1.0.0rc1/nlp/distilbert.original.py
  class DistilBertEncoder (line 25) | class DistilBertEncoder(DistilBertModel, BertModule):
    method forward (line 31) | def forward(self, input_ids, attention_mask, token_type_ids=None):

FILE: patches/nemo/1.0.0rc1/nlp/distilbert.py
  class DistilBertEncoder (line 27) | class DistilBertEncoder(DistilBertModel, BertModule):
    method input_types (line 33) | def input_types(self) -> Optional[Dict[str, NeuralType]]:
    method forward (line 55) | def forward(self, input_ids, attention_mask, token_type_ids=None):

FILE: patches/nemo/1.0.0rc1/nlp/huggingface_utils.py
  function get_huggingface_lm_model (line 77) | def get_huggingface_lm_model(
  function get_huggingface_pretrained_lm_models_list (line 118) | def get_huggingface_pretrained_lm_models_list(include_external: bool = F...

FILE: patches/nemo/1.0.0rc1/nlp/mobilebert.py
  class MobileBertEncoder (line 25) | class MobileBertEncoder(MobileBertModel, BertModule):
    method forward (line 31) | def forward(self, input_ids, attention_mask, token_type_ids):

FILE: patches/nemo/1.0.0rc1/setup.original.py
  function is_build_action (line 31) | def is_build_action():
  function req_file (line 80) | def req_file(filename, folder="requirements"):
  class StyleCommand (line 153) | class StyleCommand(distutils_cmd.Command):
    method __call_checker (line 168) | def __call_checker(self, base_command, scope, check):
    method _isort (line 184) | def _isort(self, scope, check):
    method _black (line 187) | def _black(self, scope, check):
    method _pass (line 190) | def _pass(self):
    method _fail (line 193) | def _fail(self):
    method initialize_options (line 197) | def initialize_options(self):
    method run (line 201) | def run(self):
    method finalize_options (line 212) | def finalize_options(self):

FILE: patches/nemo/1.0.0rc1/setup.py
  function is_build_action (line 31) | def is_build_action():
  function req_file (line 80) | def req_file(filename, folder="requirements"):
  class StyleCommand (line 153) | class StyleCommand(distutils_cmd.Command):
    method __call_checker (line 168) | def __call_checker(self, base_command, scope, check):
    method _isort (line 184) | def _isort(self, scope, check):
    method _black (line 187) | def _black(self, scope, check):
    method _pass (line 190) | def _pass(self):
    method _fail (line 193) | def _fail(self):
    method initialize_options (line 197) | def initialize_options(self):
    method run (line 201) | def run(self):
    method finalize_options (line 212) | def finalize_options(self):

FILE: patches/pytorch/1.6.0/functional.original.py
  function broadcast_tensors (line 36) | def broadcast_tensors(*tensors):
  function split (line 68) | def split(tensor, split_size_or_sections, dim=0):
  function _indices_product (line 118) | def _indices_product(indices):
  function _index_tensor_with_indices_list (line 130) | def _index_tensor_with_indices_list(tensor, indices):
  function lu_unpack (line 137) | def lu_unpack(LU_data, LU_pivots, unpack_data=True, unpack_pivots=True):
  function einsum (line 243) | def einsum(equation, *operands):
  function meshgrid (line 330) | def meshgrid(*tensors):
  function stft (line 368) | def stft(input, n_fft, hop_length=None, win_length=None, window=None,
  function istft (line 468) | def istft(input, n_fft, hop_length=None, win_length=None, window=None,
  function _unique_impl (line 533) | def _unique_impl(input, sorted=True, return_inverse=False, return_counts...
  function _unique_consecutive_impl (line 617) | def _unique_consecutive_impl(input, return_inverse=False, return_counts=...
  function _return_counts (line 678) | def _return_counts(input, sorted=True, return_inverse=False, return_coun...
  function _return_output (line 688) | def _return_output(input, sorted=True, return_inverse=False, return_coun...
  function _return_inverse (line 698) | def _return_inverse(input, sorted=True, return_inverse=False, return_cou...
  function _consecutive_return_counts (line 740) | def _consecutive_return_counts(input, return_inverse=False, return_count...
  function _consecutive_return_output (line 750) | def _consecutive_return_output(input, return_inverse=False, return_count...
  function _consecutive_return_inverse (line 760) | def _consecutive_return_inverse(input, return_inverse=False, return_coun...
  function tensordot (line 802) | def tensordot(a, b, dims=2):
  function cartesian_prod (line 861) | def cartesian_prod(*tensors):
  function block_diag (line 894) | def block_diag(*tensors):
  function cdist (line 929) | def cdist(x1, x2, p=2., compute_mode='use_mm_for_euclid_dist_if_necessar...
  function norm (line 986) | def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None):...
  function norm (line 991) | def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None):...
  function norm (line 996) | def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None):...
  function norm (line 1001) | def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None):...
  function norm (line 1005) | def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None):...
  function chain_matmul (line 1132) | def chain_matmul(*matrices):
  function _lu_impl (line 1167) | def _lu_impl(A, pivot=True, get_infos=False, out=None):
  function _check_list_size (line 1241) | def _check_list_size(out_len, get_infos, out):
  function _lu_with_infos (line 1251) | def _lu_with_infos(A, pivot=True, get_infos=False, out=None):
  function _lu_no_infos (line 1266) | def _lu_no_infos(A, pivot=True, get_infos=False, out=None):
  function align_tensors (line 1294) | def align_tensors(*tensors):

FILE: patches/pytorch/1.6.0/functional.py
  function broadcast_tensors (line 39) | def broadcast_tensors(*tensors):
  function split (line 71) | def split(tensor, split_size_or_sections, dim=0):
  function _indices_product (line 121) | def _indices_product(indices):
  function _index_tensor_with_indices_list (line 133) | def _index_tensor_with_indices_list(tensor, indices):
  function lu_unpack (line 140) | def lu_unpack(LU_data, LU_pivots, unpack_data=True, unpack_pivots=True):
  function einsum (line 246) | def einsum(equation, *operands):
  function meshgrid (line 333) | def meshgrid(*tensors):
  function stft (line 371) | def stft(input, n_fft, hop_length=None, win_length=None, window=None,
  function istft (line 481) | def istft(input, n_fft, hop_length=None, win_length=None, window=None,
  function _unique_impl (line 546) | def _unique_impl(input, sorted=True, return_inverse=False, return_counts...
  function _unique_consecutive_impl (line 630) | def _unique_consecutive_impl(input, return_inverse=False, return_counts=...
  function _return_counts (line 691) | def _return_counts(input, sorted=True, return_inverse=False, return_coun...
  function _return_output (line 701) | def _return_output(input, sorted=True, return_inverse=False, return_coun...
  function _return_inverse (line 711) | def _return_inverse(input, sorted=True, return_inverse=False, return_cou...
  function _consecutive_return_counts (line 753) | def _consecutive_return_counts(input, return_inverse=False, return_count...
  function _consecutive_return_output (line 763) | def _consecutive_return_output(input, return_inverse=False, return_count...
  function _consecutive_return_inverse (line 773) | def _consecutive_return_inverse(input, return_inverse=False, return_coun...
  function tensordot (line 815) | def tensordot(a, b, dims=2):
  function cartesian_prod (line 874) | def cartesian_prod(*tensors):
  function block_diag (line 907) | def block_diag(*tensors):
  function cdist (line 942) | def cdist(x1, x2, p=2., compute_mode='use_mm_for_euclid_dist_if_necessar...
  function norm (line 999) | def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None):...
  function norm (line 1004) | def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None):...
  function norm (line 1009) | def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None):...
  function norm (line 1014) | def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None):...
  function norm (line 1018) | def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None):...
  function chain_matmul (line 1145) | def chain_matmul(*matrices):
  function _lu_impl (line 1180) | def _lu_impl(A, pivot=True, get_infos=False, out=None):
  function _check_list_size (line 1254) | def _check_list_size(out_len, get_infos, out):
  function _lu_with_infos (line 1264) | def _lu_with_infos(A, pivot=True, get_infos=False, out=None):
  function _lu_no_infos (line 1279) | def _lu_no_infos(A, pivot=True, get_infos=False, out=None):
  function align_tensors (line 1307) | def align_tensors(*tensors):

FILE: patches/pytorch/1.7.0/functional.original.py
  function broadcast_tensors (line 42) | def broadcast_tensors(*tensors):
  function split (line 74) | def split(tensor, split_size_or_sections, dim=0):
  function _indices_product (line 131) | def _indices_product(indices: _Indices) -> List[List[int]]:
  function _index_tensor_with_indices_list (line 143) | def _index_tensor_with_indices_list(tensor, indices):
  function lu_unpack (line 151) | def lu_unpack(LU_data, LU_pivots, unpack_data=True, unpack_pivots=True):
  function einsum (line 261) | def einsum(equation, *operands):
  function meshgrid (line 349) | def meshgrid(*tensors: Union[Tensor, List[Tensor]]) -> Tuple[Tensor, ...]:
  function meshgrid (line 352) | def meshgrid(*tensors):
  function _meshgrid (line 356) | def _meshgrid(*tensors):
  function stft (line 394) | def stft(input: Tensor, n_fft: int, hop_length: Optional[int] = None,
  function istft (line 518) | def istft(input: Tensor, n_fft: int, hop_length: Optional[int] = None,
  function _unique_impl (line 602) | def _unique_impl(input: Tensor, sorted: bool = True,
  function _unique_consecutive_impl (line 687) | def _unique_consecutive_impl(input: Tensor, return_inverse: bool = False,
  function _return_counts (line 749) | def _return_counts(input, sorted=True, return_inverse=False, return_coun...
  function _return_output (line 760) | def _return_output(input, sorted=True, return_inverse=False, return_coun...
  function _return_inverse (line 771) | def _return_inverse(input, sorted=True, return_inverse=False, return_cou...
  function _consecutive_return_counts (line 814) | def _consecutive_return_counts(input, return_inverse=False, return_count...
  function _consecutive_return_output (line 825) | def _consecutive_return_output(input, return_inverse=False, return_count...
  function _consecutive_return_inverse (line 836) | def _consecutive_return_inverse(input, return_inverse=False, return_coun...
  function tensordot (line 879) | def tensordot(a, b, dims=2):
  function cartesian_prod (line 938) | def cartesian_prod(*tensors):
  function block_diag (line 971) | def block_diag(*tensors):
  function cdist (line 1006) | def cdist(x1, x2, p=2., compute_mode='use_mm_for_euclid_dist_if_necessar...
  function atleast_1d (line 1061) | def atleast_1d(*tensors):
  function atleast_2d (line 1095) | def atleast_2d(*tensors):
  function atleast_3d (line 1130) | def atleast_3d(*tensors):
  function norm (line 1191) | def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None):...
  function norm (line 1196) | def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None):...
  function norm (line 1201) | def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None):...
  function norm (line 1206) | def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None):...
  function norm (line 1211) | def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None):...
  function chain_matmul (line 1346) | def chain_matmul(*matrices):
  function _lu_impl (line 1381) | def _lu_impl(A, pivot=True, get_infos=False, out=None):
  function _check_list_size (line 1461) | def _check_list_size(out_len: int, get_infos: bool, out: _ListOrSeq) -> ...
  function _lu_with_infos (line 1468) | def _lu_with_infos(A, pivot=True, get_infos=False, out=None):
  function _lu_no_infos (line 1483) | def _lu_no_infos(A, pivot=True, get_infos=False, out=None):
  function align_tensors (line 1511) | def align_tensors(*tensors):

FILE: patches/pytorch/1.7.0/functional.py
  function broadcast_tensors (line 45) | def broadcast_tensors(*tensors):
  function split (line 77) | def split(tensor, split_size_or_sections, dim=0):
  function _indices_product (line 134) | def _indices_product(indices: _Indices) -> List[List[int]]:
  function _index_tensor_with_indices_list (line 146) | def _index_tensor_with_indices_list(tensor, indices):
  function lu_unpack (line 154) | def lu_unpack(LU_data, LU_pivots, unpack_data=True, unpack_pivots=True):
  function einsum (line 264) | def einsum(equation, *operands):
  function meshgrid (line 352) | def meshgrid(*tensors: Union[Tensor, List[Tensor]]) -> Tuple[Tensor, ...]:
  function meshgrid (line 355) | def meshgrid(*tensors):
  function _meshgrid (line 359) | def _meshgrid(*tensors):
  function stft (line 397) | def stft(input: Tensor, n_fft: int, hop_length: Optional[int] = None,
  function istft (line 530) | def istft(input: Tensor, n_fft: int, hop_length: Optional[int] = None,
  function _unique_impl (line 614) | def _unique_impl(input: Tensor, sorted: bool = True,
  function _unique_consecutive_impl (line 699) | def _unique_consecutive_impl(input: Tensor, return_inverse: bool = False,
  function _return_counts (line 761) | def _return_counts(input, sorted=True, return_inverse=False, return_coun...
  function _return_output (line 772) | def _return_output(input, sorted=True, return_inverse=False, return_coun...
  function _return_inverse (line 783) | def _return_inverse(input, sorted=True, return_inverse=False, return_cou...
  function _consecutive_return_counts (line 826) | def _consecutive_return_counts(input, return_inverse=False, return_count...
  function _consecutive_return_output (line 837) | def _consecutive_return_output(input, return_inverse=False, return_count...
  function _consecutive_return_inverse (line 848) | def _consecutive_return_inverse(input, return_inverse=False, return_coun...
  function tensordot (line 891) | def tensordot(a, b, dims=2):
  function cartesian_prod (line 950) | def cartesian_prod(*tensors):
  function block_diag (line 983) | def block_diag(*tensors):
  function cdist (line 1018) | def cdist(x1, x2, p=2., compute_mode='use_mm_for_euclid_dist_if_necessar...
  function atleast_1d (line 1073) | def atleast_1d(*tensors):
  function atleast_2d (line 1107) | def atleast_2d(*tensors):
  function atleast_3d (line 1142) | def atleast_3d(*tensors):
  function norm (line 1203) | def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None):...
  function norm (line 1208) | def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None):...
  function norm (line 1213) | def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None):...
  function norm (line 1218) | def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None):...
  function norm (line 1223) | def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None):...
  function chain_matmul (line 1358) | def chain_matmul(*matrices):
  function _lu_impl (line 1393) | def _lu_impl(A, pivot=True, get_infos=False, out=None):
  function _check_list_size (line 1473) | def _check_list_size(out_len: int, get_infos: bool, out: _ListOrSeq) -> ...
  function _lu_with_infos (line 1480) | def _lu_with_infos(A, pivot=True, get_infos=False, out=None):
  function _lu_no_infos (line 1495) | def _lu_no_infos(A, pivot=True, get_infos=False, out=None):
  function align_tensors (line 1523) | def align_tensors(*tensors):

FILE: patches/transformers/4.5.0/convert_graph_to_onnx.original.py
  class OnnxConverterArgumentParser (line 45) | class OnnxConverterArgumentParser(ArgumentParser):
    method __init__ (line 50) | def __init__(self):
  function generate_identified_filename (line 91) | def generate_identified_filename(filename: Path, identifier: str) -> Path:
  function check_onnxruntime_requirements (line 104) | def check_onnxruntime_requirements(minimum_version: Version):
  function ensure_valid_input (line 133) | def ensure_valid_input(model, tokens, input_names):
  function infer_shapes (line 161) | def infer_shapes(nlp: Pipeline, framework: str) -> Tuple[List[str], List...
  function load_graph_from_args (line 225) | def load_graph_from_args(
  function convert_pytorch (line 256) | def convert_pytorch(nlp: Pipeline, opset: int, output: Path, use_externa...
  function convert_tensorflow (line 295) | def convert_tensorflow(nlp: Pipeline, opset: int, output: Path):
  function convert (line 332) | def convert(
  function optimize (line 376) | def optimize(onnx_model_path: Path) -> Path:
  function quantize (line 401) | def quantize(onnx_model_path: Path) -> Path:
  function verify (line 438) | def verify(path: Path):

FILE: patches/transformers/4.5.0/convert_graph_to_onnx.py
  class OnnxConverterArgumentParser (line 48) | class OnnxConverterArgumentParser(ArgumentParser):
    method __init__ (line 53) | def __init__(self):
  function generate_identified_filename (line 99) | def generate_identified_filename(filename: Path, identifier: str) -> Path:
  function check_onnxruntime_requirements (line 112) | def check_onnxruntime_requirements(minimum_version: Version):
  function ensure_valid_input (line 141) | def ensure_valid_input(model, tokens, input_names):
  function infer_shapes (line 169) | def infer_shapes(nlp: Pipeline, framework: str) -> Tuple[List[str], List...
  function load_graph_from_args (line 233) | def load_graph_from_args(
  function convert_pytorch (line 264) | def convert_pytorch(nlp: Pipeline, opset: int, output: Path, use_externa...
  function convert_tensorflow (line 311) | def convert_tensorflow(nlp: Pipeline, opset: int, output: Path):
  function convert (line 348) | def convert(
  function optimize (line 409) | def optimize(onnx_model_path: Path) -> Path:
  function quantize (line 434) | def quantize(onnx_model_path: Path) -> Path:
  function verify (line 471) | def verify(path: Path):

FILE: patches/transformers/4.5.0/modeling_distilbert.py
  function create_sinusoidal_embeddings (line 74) | def create_sinusoidal_embeddings(n_pos, dim, out):
  class Embeddings (line 82) | class Embeddings(nn.Module):
    method __init__ (line 83) | def __init__(self, config):
    method forward (line 95) | def forward(self, input_ids):
  class MultiHeadSelfAttention (line 116) | class MultiHeadSelfAttention(nn.Module):
    method __init__ (line 117) | def __init__(self, config):
    method prune_heads (line 133) | def prune_heads(self, heads):
    method forward (line 148) | def forward(self, query, key, value, mask, head_mask=None, output_atte...
  class FFN (line 203) | class FFN(nn.Module):
    method __init__ (line 204) | def __init__(self, config):
    method forward (line 214) | def forward(self, input):
    method ff_chunk (line 217) | def ff_chunk(self, input):
  class TransformerBlock (line 225) | class TransformerBlock(nn.Module):
    method __init__ (line 226) | def __init__(self, config):
    method forward (line 237) | def forward(self, x, attn_mask=None, head_mask=None, output_attentions...
  class Transformer (line 273) | class Transformer(nn.Module):
    method __init__ (line 274) | def __init__(self, config):
    method forward (line 281) | def forward(
  class DistilBertPreTrainedModel (line 330) | class DistilBertPreTrainedModel(PreTrainedModel):
    method _init_weights (line 340) | def _init_weights(self, module):
  class DistilBertModel (line 416) | class DistilBertModel(DistilBertPreTrainedModel):
    method __init__ (line 417) | def __init__(self, config):
    method get_input_embeddings (line 425) | def get_input_embeddings(self):
    method set_input_embeddings (line 428) | def set_input_embeddings(self, new_embeddings):
    method _prune_heads (line 431) | def _prune_heads(self, heads_to_prune):
    method forward (line 446) | def forward(
  class DistilBertForMaskedLM (line 495) | class DistilBertForMaskedLM(DistilBertPreTrainedModel):
    method __init__ (line 496) | def __init__(self, config):
    method get_output_embeddings (line 508) | def get_output_embeddings(self):
    method set_output_embeddings (line 511) | def set_output_embeddings(self, new_embeddings):
    method forward (line 521) | def forward(
  class DistilBertForSequenceClassification (line 578) | class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
    method __init__ (line 579) | def __init__(self, config):
    method forward (line 597) | def forward(
  class DistilBertForQuestionAnswering (line 660) | class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
    method __init__ (line 661) | def __init__(self, config):
    method forward (line 678) | def forward(
  class DistilBertForTokenClassification (line 756) | class DistilBertForTokenClassification(DistilBertPreTrainedModel):
    method __init__ (line 757) | def __init__(self, config):
    method forward (line 774) | def forward(
  class DistilBertForMultipleChoice (line 840) | class DistilBertForMultipleChoice(DistilBertPreTrainedModel):
    method __init__ (line 841) | def __init__(self, config):
    method forward (line 855) | def forward(

FILE: patches/transformers/4.5.1/convert_graph_to_onnx.original.py
  class OnnxConverterArgumentParser (line 45) | class OnnxConverterArgumentParser(ArgumentParser):
    method __init__ (line 50) | def __init__(self):
  function generate_identified_filename (line 91) | def generate_identified_filename(filename: Path, identifier: str) -> Path:
  function check_onnxruntime_requirements (line 104) | def check_onnxruntime_requirements(minimum_version: Version):
  function ensure_valid_input (line 133) | def ensure_valid_input(model, tokens, input_names):
  function infer_shapes (line 161) | def infer_shapes(nlp: Pipeline, framework: str) -> Tuple[List[str], List...
  function load_graph_from_args (line 225) | def load_graph_from_args(
  function convert_pytorch (line 256) | def convert_pytorch(nlp: Pipeline, opset: int, output: Path, use_externa...
  function convert_tensorflow (line 295) | def convert_tensorflow(nlp: Pipeline, opset: int, output: Path):
  function convert (line 332) | def convert(
  function optimize (line 376) | def optimize(onnx_model_path: Path) -> Path:
  function quantize (line 401) | def quantize(onnx_model_path: Path) -> Path:
  function verify (line 438) | def verify(path: Path):

FILE: patches/transformers/4.5.1/convert_graph_to_onnx.py
  class OnnxConverterArgumentParser (line 48) | class OnnxConverterArgumentParser(ArgumentParser):
    method __init__ (line 53) | def __init__(self):
  function generate_identified_filename (line 99) | def generate_identified_filename(filename: Path, identifier: str) -> Path:
  function check_onnxruntime_requirements (line 112) | def check_onnxruntime_requirements(minimum_version: Version):
  function ensure_valid_input (line 141) | def ensure_valid_input(model, tokens, input_names):
  function infer_shapes (line 169) | def infer_shapes(nlp: Pipeline, framework: str) -> Tuple[List[str], List...
  function load_graph_from_args (line 233) | def load_graph_from_args(
  function convert_pytorch (line 264) | def convert_pytorch(nlp: Pipeline, opset: int, output: Path, use_externa...
  function convert_tensorflow (line 311) | def convert_tensorflow(nlp: Pipeline, opset: int, output: Path):
  function convert (line 348) | def convert(
  function optimize (line 409) | def optimize(onnx_model_path: Path) -> Path:
  function quantize (line 434) | def quantize(onnx_model_path: Path) -> Path:
  function verify (line 471) | def verify(path: Path):

FILE: patches/transformers/4.5.1/modeling_distilbert.original.py
  function create_sinusoidal_embeddings (line 74) | def create_sinusoidal_embeddings(n_pos, dim, out):
  class Embeddings (line 82) | class Embeddings(nn.Module):
    method __init__ (line 83) | def __init__(self, config):
    method forward (line 95) | def forward(self, input_ids):
  class MultiHeadSelfAttention (line 116) | class MultiHeadSelfAttention(nn.Module):
    method __init__ (line 117) | def __init__(self, config):
    method prune_heads (line 133) | def prune_heads(self, heads):
    method forward (line 148) | def forward(self, query, key, value, mask, head_mask=None, output_atte...
  class FFN (line 203) | class FFN(nn.Module):
    method __init__ (line 204) | def __init__(self, config):
    method forward (line 214) | def forward(self, input):
    method ff_chunk (line 217) | def ff_chunk(self, input):
  class TransformerBlock (line 225) | class TransformerBlock(nn.Module):
    method __init__ (line 226) | def __init__(self, config):
    method forward (line 237) | def forward(self, x, attn_mask=None, head_mask=None, output_attentions...
  class Transformer (line 273) | class Transformer(nn.Module):
    method __init__ (line 274) | def __init__(self, config):
    method forward (line 281) | def forward(
  class DistilBertPreTrainedModel (line 330) | class DistilBertPreTrainedModel(PreTrainedModel):
    method _init_weights (line 340) | def _init_weights(self, module):
  class DistilBertModel (line 416) | class DistilBertModel(DistilBertPreTrainedModel):
    method __init__ (line 417) | def __init__(self, config):
    method get_input_embeddings (line 425) | def get_input_embeddings(self):
    method set_input_embeddings (line 428) | def set_input_embeddings(self, new_embeddings):
    method _prune_heads (line 431) | def _prune_heads(self, heads_to_prune):
    method forward (line 446) | def forward(
  class DistilBertForMaskedLM (line 495) | class DistilBertForMaskedLM(DistilBertPreTrainedModel):
    method __init__ (line 496) | def __init__(self, config):
    method get_output_embeddings (line 508) | def get_output_embeddings(self):
    method set_output_embeddings (line 511) | def set_output_embeddings(self, new_embeddings):
    method forward (line 521) | def forward(
  class DistilBertForSequenceClassification (line 578) | class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
    method __init__ (line 579) | def __init__(self, config):
    method forward (line 597) | def forward(
  class DistilBertForQuestionAnswering (line 660) | class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
    method __init__ (line 661) | def __init__(self, config):
    method forward (line 678) | def forward(
  class DistilBertForTokenClassification (line 756) | class DistilBertForTokenClassification(DistilBertPreTrainedModel):
    method __init__ (line 757) | def __init__(self, config):
    method forward (line 774) | def forward(
  class DistilBertForMultipleChoice (line 840) | class DistilBertForMultipleChoice(DistilBertPreTrainedModel):
    method __init__ (line 841) | def __init__(self, config):
    method forward (line 855) | def forward(

FILE: patches/transformers/4.5.1/modeling_distilbert.py
  function create_sinusoidal_embeddings (line 74) | def create_sinusoidal_embeddings(n_pos, dim, out):
  class Embeddings (line 82) | class Embeddings(nn.Module):
    method __init__ (line 83) | def __init__(self, config):
    method forward (line 95) | def forward(self, input_ids):
  class MultiHeadSelfAttention (line 116) | class MultiHeadSelfAttention(nn.Module):
    method __init__ (line 117) | def __init__(self, config):
    method prune_heads (line 133) | def prune_heads(self, heads):
    method forward (line 148) | def forward(self, query, key, value, mask, head_mask=None, output_atte...
  class FFN (line 203) | class FFN(nn.Module):
    method __init__ (line 204) | def __init__(self, config):
    method forward (line 214) | def forward(self, input):
    method ff_chunk (line 217) | def ff_chunk(self, input):
  class TransformerBlock (line 225) | class TransformerBlock(nn.Module):
    method __init__ (line 226) | def __init__(self, config):
    method forward (line 237) | def forward(self, x, attn_mask=None, head_mask=None, output_attentions...
  class Transformer (line 273) | class Transformer(nn.Module):
    method __init__ (line 274) | def __init__(self, config):
    method forward (line 281) | def forward(
  class DistilBertPreTrainedModel (line 330) | class DistilBertPreTrainedModel(PreTrainedModel):
    method _init_weights (line 340) | def _init_weights(self, module):
  class DistilBertModel (line 416) | class DistilBertModel(DistilBertPreTrainedModel):
    method __init__ (line 417) | def __init__(self, config):
    method get_input_embeddings (line 425) | def get_input_embeddings(self):
    method set_input_embeddings (line 428) | def set_input_embeddings(self, new_embeddings):
    method _prune_heads (line 431) | def _prune_heads(self, heads_to_prune):
    method forward (line 446) | def forward(
  class DistilBertForMaskedLM (line 495) | class DistilBertForMaskedLM(DistilBertPreTrainedModel):
    method __init__ (line 496) | def __init__(self, config):
    method get_output_embeddings (line 508) | def get_output_embeddings(self):
    method set_output_embeddings (line 511) | def set_output_embeddings(self, new_embeddings):
    method forward (line 521) | def forward(
  class DistilBertForSequenceClassification (line 578) | class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
    method __init__ (line 579) | def __init__(self, config):
    method forward (line 597) | def forward(
  class DistilBertForQuestionAnswering (line 660) | class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
    method __init__ (line 661) | def __init__(self, config):
    method forward (line 678) | def forward(
  class DistilBertForTokenClassification (line 756) | class DistilBertForTokenClassification(DistilBertPreTrainedModel):
    method __init__ (line 757) | def __init__(self, config):
    method forward (line 774) | def forward(
  class DistilBertForMultipleChoice (line 840) | class DistilBertForMultipleChoice(DistilBertPreTrainedModel):
    method __init__ (line 841) | def __init__(self, config):
    method forward (line 855) | def forward(

FILE: ros/jetson_voice_ros/asr.py
  class ASRNode (line 13) | class ASRNode(Node):
    method __init__ (line 14) | def __init__(self):
    method audio_listener (line 34) | def audio_listener(self, msg):
  function main (line 60) | def main(args=None):

FILE: ros/jetson_voice_ros/audio_input.py
  class AudioInputNode (line 12) | class AudioInputNode(Node):
    method __init__ (line 13) | def __init__(self):
    method publish_audio (line 58) | def publish_audio(self):
  function main (line 99) | def main(args=None):

FILE: ros/jetson_voice_ros/audio_output.py
  class AudioOutputNode (line 14) | class AudioOutputNode(Node):
    method __init__ (line 15) | def __init__(self):
    method audio_listener (line 47) | def audio_listener(self, msg):
  function main (line 65) | def main(args=None):

FILE: ros/jetson_voice_ros/nlp_intent_slot.py
  class NLPIntentSlotNode (line 12) | class NLPIntentSlotNode(Node):
    method __init__ (line 13) | def __init__(self):
    method query_listener (line 29) | def query_listener(self, msg):
  function main (line 70) | def main(args=None):

FILE: ros/jetson_voice_ros/nlp_question_answer.py
  class NLPQuestionAnswerNode (line 12) | class NLPQuestionAnswerNode(Node):
    method __init__ (line 13) | def __init__(self):
    method query_listener (line 29) | def query_listener(self, msg):
  function main (line 58) | def main(args=None):

FILE: ros/jetson_voice_ros/tts.py
  class TTSNode (line 14) | class TTSNode(Node):
    method __init__ (line 15) | def __init__(self):
    method text_listener (line 31) | def text_listener(self, msg):
  function main (line 57) | def main(args=None):

FILE: ros/launch/asr.launch.py
  function generate_launch_description (line 13) | def generate_launch_description():

FILE: ros/launch/audio_playback.launch.py
  function generate_launch_description (line 13) | def generate_launch_description():

FILE: ros/launch/tts.launch.py
  function generate_launch_description (line 13) | def generate_launch_description():

FILE: scripts/record_mic.py
  function signal_handler (line 30) | def signal_handler(sig, frame):

FILE: tests/run_tests.py
  function run_test (line 32) | def run_test(module, model, config, args=None, log_dir=None):
  function filter_test (line 62) | def filter_test(test):

Download .json

Condensed preview — 115 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (802K chars).

[
  {
    "path": ".dockerignore",
    "chars": 19,
    "preview": "data/\r\n.git\r\n.cache"
  },
  {
    "path": ".gitignore",
    "chars": 1821,
    "preview": "data/\nlogs/\npackages/\n# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# "
  },
  {
    "path": ".gitmodules",
    "chars": 111,
    "preview": "[submodule \"docker/containers\"]\n\tpath = docker/containers\n\turl = https://github.com/dusty-nv/jetson-containers\n"
  },
  {
    "path": "Dockerfile.aarch64",
    "chars": 9372,
    "preview": "# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.\r\n#\r\n# Permission is hereby granted, free of charge, to an"
  },
  {
    "path": "Dockerfile.ros",
    "chars": 2249,
    "preview": "# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.\r\n#\r\n# Permission is hereby granted, free of charge, to an"
  },
  {
    "path": "Dockerfile.runtime",
    "chars": 1606,
    "preview": "# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.\r\n#\r\n# Permission is hereby granted, free of charge, to an"
  },
  {
    "path": "Dockerfile.x86_64",
    "chars": 4102,
    "preview": "# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.\r\n#\r\n# Permission is hereby granted, free of charge, to an"
  },
  {
    "path": "README.md",
    "chars": 19016,
    "preview": "# jetson-voice\r\n\r\njetson-voice is an ASR/NLP/TTS deep learning inference library for Jetson Nano, TX1/TX2, Xavier NX, an"
  },
  {
    "path": "docker/build.sh",
    "chars": 2912,
    "preview": "#!/usr/bin/env bash\n\nROS_DISTRO=${1:-\"none\"}\nBASE_IMAGE=$2\nNEMO_VERSION=\"1.0.0rc1\"\n\n# find container tag from os version"
  },
  {
    "path": "docker/push.sh",
    "chars": 407,
    "preview": "#!/usr/bin/env bash\n\nROS_DISTRO=${1:-\"foxy\"}\nsource docker/tag.sh\n\n# push image\npush() \n{\n\tlocal remote_image=\"dustynv/$"
  },
  {
    "path": "docker/run.sh",
    "chars": 6176,
    "preview": "#!/usr/bin/env bash\n#\n# Start an instance of the jetson-voice docker container.\n# See below or run this script with -h o"
  },
  {
    "path": "docker/tag.sh",
    "chars": 348,
    "preview": "#!/usr/bin/env bash\n\n# find OS version\nsource scripts/os_version.sh\n\nif [ $ARCH = \"aarch64\" ]; then\n\tTAG=\"r$L4T_VERSION\""
  },
  {
    "path": "examples/asr.py",
    "chars": 1298,
    "preview": "#!/usr/bin/env python3\n# coding: utf-8\n\nimport sys\n\nfrom jetson_voice import ASR, AudioInput, ConfigArgParser, list_audi"
  },
  {
    "path": "examples/assistant.py",
    "chars": 4081,
    "preview": "#!/usr/bin/env python3\n# coding: utf-8\n\nimport sys\nimport pprint\n\nfrom jetson_voice import (\n    ASR, NLP, TTS, \n    Aud"
  },
  {
    "path": "examples/nlp.py",
    "chars": 899,
    "preview": "#!/usr/bin/env python3\n# coding: utf-8\n\nimport sys\nimport pprint\nimport readline\n\nfrom jetson_voice import NLP, ConfigAr"
  },
  {
    "path": "examples/nlp_qa.py",
    "chars": 4135,
    "preview": "#!/usr/bin/env python3\n# coding: utf-8\n\nimport sys\nimport readline\n\nfrom jetson_voice import QuestionAnswer, ConfigArgPa"
  },
  {
    "path": "examples/tts.py",
    "chars": 2151,
    "preview": "#!/usr/bin/env python3\n# coding: utf-8\n\nimport os\nimport sys\nimport time\nimport readline\n\nfrom jetson_voice import TTS, "
  },
  {
    "path": "jetson_voice/__init__.py",
    "chars": 590,
    "preview": "#!/usr/bin/env python3\r\n# coding: utf-8\r\n\r\nfrom .utils import (\r\n    find_resource, list_models, global_config, ConfigDi"
  },
  {
    "path": "jetson_voice/asr.py",
    "chars": 3905,
    "preview": "#!/usr/bin/env python3\n# coding: utf-8\n\nfrom jetson_voice.utils import load_resource\n\n\ndef ASR(resource, *args, **kwargs"
  },
  {
    "path": "jetson_voice/auto.py",
    "chars": 1651,
    "preview": "#!/usr/bin/env python3\n# coding: utf-8\n\nfrom jetson_voice.asr import ASR\nfrom jetson_voice.nlp import IntentSlot, Questi"
  },
  {
    "path": "jetson_voice/backends/onnxruntime/__init__.py",
    "chars": 86,
    "preview": "#!/usr/bin/env python3\r\n# coding: utf-8\r\n\r\nfrom .ort_model import OnnxRuntimeModel\r\n\r\n"
  },
  {
    "path": "jetson_voice/backends/onnxruntime/ort_model.py",
    "chars": 3170,
    "preview": "#!/usr/bin/env python3\n# coding: utf-8\n\nimport os\nimport logging\n\n# for some reason if PyCUDA isn't initialized before O"
  },
  {
    "path": "jetson_voice/backends/riva/__init__.py",
    "chars": 119,
    "preview": "#!/usr/bin/env python3\r\n# coding: utf-8\r\n\r\nfrom .riva_asr import RivaASRService\r\nfrom .riva_tts import RivaTTSService\r\n"
  },
  {
    "path": "jetson_voice/backends/riva/riva_asr.py",
    "chars": 5990,
    "preview": "#!/usr/bin/env python3\r\n# coding: utf-8\r\n\r\nimport os\r\nimport grpc\r\nimport queue\r\nimport threading\r\nimport logging\r\n\r\nimp"
  },
  {
    "path": "jetson_voice/backends/riva/riva_tts.py",
    "chars": 2003,
    "preview": "#!/usr/bin/env python3\r\n# coding: utf-8\r\n\r\nimport os\r\nimport grpc\r\nimport logging\r\nimport numpy as np\r\n\r\nimport riva_api"
  },
  {
    "path": "jetson_voice/backends/tensorrt/__init__.py",
    "chars": 78,
    "preview": "#!/usr/bin/env python3\r\n# coding: utf-8\r\n\r\nfrom .trt_model import TRTModel\r\n\r\n"
  },
  {
    "path": "jetson_voice/backends/tensorrt/trt_binding.py",
    "chars": 3921,
    "preview": "#!/usr/bin/env python3\n# coding: utf-8\n\nimport logging\nimport tensorrt as trt\n\nimport pycuda.driver as cuda\nimport pycud"
  },
  {
    "path": "jetson_voice/backends/tensorrt/trt_builder.py",
    "chars": 8420,
    "preview": "#!/usr/bin/env python3\n# coding: utf-8\n\nimport os\nimport time\nimport json\nimport logging\nimport tensorrt as trt\n\nimport "
  },
  {
    "path": "jetson_voice/backends/tensorrt/trt_model.py",
    "chars": 6900,
    "preview": "#!/usr/bin/env python3\n# coding: utf-8\n\nimport os\nimport time\nimport json\nimport logging\nimport pprint\n\nimport numpy as "
  },
  {
    "path": "jetson_voice/models/__init__.py",
    "chars": 207,
    "preview": "#!/usr/bin/env python3\r\n# coding: utf-8\r\n\r\nfrom .asr import ASREngine\r\nfrom .nlp import IntentSlotEngine, QuestionAnswer"
  },
  {
    "path": "jetson_voice/models/asr/__init__.py",
    "chars": 78,
    "preview": "#!/usr/bin/env python3\r\n# coding: utf-8\r\n\r\nfrom .asr_engine import ASREngine\r\n"
  },
  {
    "path": "jetson_voice/models/asr/asr_engine.py",
    "chars": 10765,
    "preview": "#!/usr/bin/env python3\n# coding: utf-8\n\nimport os\nimport time\nimport pprint\nimport logging\nimport importlib\n\nimport torc"
  },
  {
    "path": "jetson_voice/models/asr/ctc_beamsearch.py",
    "chars": 6036,
    "preview": "#!/usr/bin/env python3\n# coding: utf-8\n\nimport os\nimport logging\n\nfrom .ctc_decoder import CTCDecoder\nfrom .ctc_utils im"
  },
  {
    "path": "jetson_voice/models/asr/ctc_decoder.py",
    "chars": 3053,
    "preview": "#!/usr/bin/env python3\n# coding: utf-8\n\n        \nclass CTCDecoder:\n    \"\"\"\n    CTC decoder base class for ASR.\n    \"\"\"  "
  },
  {
    "path": "jetson_voice/models/asr/ctc_greedy.py",
    "chars": 5274,
    "preview": "#!/usr/bin/env python3\n# coding: utf-8\n\nimport string\nimport numpy as np\n\nfrom .ctc_decoder import CTCDecoder\nfrom .ctc_"
  },
  {
    "path": "jetson_voice/models/asr/ctc_utils.py",
    "chars": 6803,
    "preview": "#!/usr/bin/env python3\n# coding: utf-8\n\nimport nltk\nimport numpy as np\n\nfrom jetson_voice.utils import global_config\n\n\nd"
  },
  {
    "path": "jetson_voice/models/nlp/__init__.py",
    "chars": 255,
    "preview": "#!/usr/bin/env python3\r\n# coding: utf-8\r\n\r\nfrom .intent_slot import IntentSlotEngine\r\nfrom .question_answer import Quest"
  },
  {
    "path": "jetson_voice/models/nlp/intent_slot.py",
    "chars": 5974,
    "preview": "#!/usr/bin/env python3\n# coding: utf-8\n\nimport os\nimport logging\nimport numpy as np\n\nfrom transformers import AutoTokeni"
  },
  {
    "path": "jetson_voice/models/nlp/nlp_utils.py",
    "chars": 2386,
    "preview": "#!/usr/bin/env python3\n# coding: utf-8\n\nimport numpy as np\n\n\n# NLP BERT models (and BERT derivatives) have myelin proble"
  },
  {
    "path": "jetson_voice/models/nlp/question_answer.py",
    "chars": 10739,
    "preview": "#!/usr/bin/env python3\n# coding: utf-8\n\nimport os\nimport logging\nimport numpy as np\n\nfrom transformers import AutoTokeni"
  },
  {
    "path": "jetson_voice/models/nlp/text_classification.py",
    "chars": 3173,
    "preview": "#!/usr/bin/env python3\n# coding: utf-8\n\nimport os\nimport logging\nimport numpy as np\n\nfrom transformers import AutoTokeni"
  },
  {
    "path": "jetson_voice/models/nlp/token_classification.py",
    "chars": 5336,
    "preview": "#!/usr/bin/env python3\n# coding: utf-8\n\nimport os\nimport logging\nimport numpy as np\n\nfrom transformers import AutoTokeni"
  },
  {
    "path": "jetson_voice/models/tts/__init__.py",
    "chars": 78,
    "preview": "#!/usr/bin/env python3\r\n# coding: utf-8\r\n\r\nfrom .tts_engine import TTSEngine\r\n"
  },
  {
    "path": "jetson_voice/models/tts/tts_engine.py",
    "chars": 4908,
    "preview": "#!/usr/bin/env python3\n# coding: utf-8\n\nimport os\nimport re\nimport logging\nimport inflect\n\nimport numpy as np\n\nfrom jets"
  },
  {
    "path": "jetson_voice/nlp.py",
    "chars": 12936,
    "preview": "#!/usr/bin/env python3\n# coding: utf-8\n\nfrom jetson_voice.utils import load_resource\n\n\ndef NLP(resource, *args, **kwargs"
  },
  {
    "path": "jetson_voice/tts.py",
    "chars": 3133,
    "preview": "#!/usr/bin/env python3\n# coding: utf-8\n\nfrom jetson_voice.utils import load_resource\n\n\ndef TTS(resource, *args, **kwargs"
  },
  {
    "path": "jetson_voice/utils/__init__.py",
    "chars": 254,
    "preview": "#!/usr/bin/env python3\r\n# coding: utf-8\r\n\r\nfrom .config import global_config, ConfigDict, ConfigArgParser\r\nfrom .resourc"
  },
  {
    "path": "jetson_voice/utils/audio.py",
    "chars": 12679,
    "preview": "#!/usr/bin/env python3\n# coding: utf-8\n\nimport os\nimport math\nimport pprint\nimport logging\nimport librosa\nimport soundfi"
  },
  {
    "path": "jetson_voice/utils/config.py",
    "chars": 8140,
    "preview": "#!/usr/bin/env python3\r\n# coding: utf-8\r\n\r\nimport os\r\nimport json\r\nimport pprint\r\nimport logging\r\nimport argparse\r\n\r\n\r\n#"
  },
  {
    "path": "jetson_voice/utils/resource.py",
    "chars": 8947,
    "preview": "#!/usr/bin/env python3\r\n# coding: utf-8\r\n\r\nimport os\r\nimport json\r\nimport time\r\nimport tqdm\r\nimport pprint\r\nimport loggi"
  },
  {
    "path": "jetson_voice/utils/softmax.py",
    "chars": 1418,
    "preview": "#!/usr/bin/env python3\n# coding: utf-8\n\nimport numpy as np\n\n    \ndef softmax(x, theta=1.0, axis=None):\n    \"\"\"\n    Compu"
  },
  {
    "path": "patches/nemo/1.0.0rc1/exportable.original.py",
    "chars": 11923,
    "preview": "# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the "
  },
  {
    "path": "patches/nemo/1.0.0rc1/exportable.py",
    "chars": 12995,
    "preview": "# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the "
  },
  {
    "path": "patches/nemo/1.0.0rc1/nlp/__init__.py",
    "chars": 1189,
    "preview": "# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the "
  },
  {
    "path": "patches/nemo/1.0.0rc1/nlp/distilbert.diff",
    "chars": 1257,
    "preview": "17a18\n> from typing import Dict, Optional\n19a21\n> from nemo.core.neural_types import ChannelType, MaskType, NeuralType\n2"
  },
  {
    "path": "patches/nemo/1.0.0rc1/nlp/distilbert.original.py",
    "chars": 1332,
    "preview": "# Copyright 2020 The Google AI Language Team Authors and\n# The HuggingFace Inc. team.\n# Copyright (c) 2020, NVIDIA CORPO"
  },
  {
    "path": "patches/nemo/1.0.0rc1/nlp/distilbert.py",
    "chars": 2483,
    "preview": "# Copyright 2020 The Google AI Language Team Authors and\n# The HuggingFace Inc. team.\n# Copyright (c) 2020, NVIDIA CORPO"
  },
  {
    "path": "patches/nemo/1.0.0rc1/nlp/huggingface_utils.py",
    "chars": 5116,
    "preview": "# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the "
  },
  {
    "path": "patches/nemo/1.0.0rc1/nlp/location.txt",
    "chars": 111,
    "preview": "nemo/collections/nlp/modules/common/huggingface\r\n\r\nMain branch. Commit 21a17b267fac68d4cdd20f3969a580a0a40dbdb4"
  },
  {
    "path": "patches/nemo/1.0.0rc1/nlp/mobilebert.py",
    "chars": 1272,
    "preview": "# Copyright 2018 The Google AI Language Team Authors and\n# The HuggingFace Inc. team.\n# Copyright (c) 2020, NVIDIA CORPO"
  },
  {
    "path": "patches/nemo/1.0.0rc1/setup.original.py",
    "chars": 9832,
    "preview": "# ! /usr/bin/python\n# -*- coding: utf-8 -*-\n\n# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.\n#\n# License"
  },
  {
    "path": "patches/nemo/1.0.0rc1/setup.py",
    "chars": 9983,
    "preview": "# ! /usr/bin/python\n# -*- coding: utf-8 -*-\n\n# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.\n#\n# License"
  },
  {
    "path": "patches/nemo/1.6.2/requirements.original.txt",
    "chars": 142,
    "preview": "numpy>=1.21\nonnx>=1.7.0\npython-dateutil\ntorch\nwrapt\nruamel.yaml\nscikit-learn\nsentencepiece<1.0.0\ntqdm>=4.41.0\nnumba\nwget"
  },
  {
    "path": "patches/nemo/1.6.2/requirements.txt",
    "chars": 136,
    "preview": "numpy\nonnx>=1.7.0\npython-dateutil\ntorch\nwrapt\nruamel.yaml\nscikit-learn\nsentencepiece<1.0.0\ntqdm>=4.41.0\nnumba\nwget\nfroze"
  },
  {
    "path": "patches/nemo/1.6.2/requirements_nlp.original.txt",
    "chars": 171,
    "preview": "boto3\nh5py\nmatplotlib>=3.3.2\nsentencepiece\nyoutokentome>=1.0.5\nnumpy\nrapidfuzz\ngdown\ninflect\nsacrebleu[ja]\nsacremoses>=0"
  },
  {
    "path": "patches/nemo/1.6.2/requirements_nlp.txt",
    "chars": 140,
    "preview": "boto3\nh5py\nmatplotlib\nsentencepiece\nyoutokentome>=1.0.5\nnumpy\ngdown\ninflect\nsacremoses>=0.0.43\nnltk>=3.6.5\nfasttext\nopen"
  },
  {
    "path": "patches/pytorch/1.6.0/functional.diff",
    "chars": 835,
    "preview": "2a3,5\n> import librosa  # STFT patch for aarch64\n> import numpy as np\n> \n465c468,478\n<     return _VF.stft(input, n_fft,"
  },
  {
    "path": "patches/pytorch/1.6.0/functional.original.py",
    "chars": 55443,
    "preview": "from typing import Tuple, Optional\n\nimport torch\nimport torch.nn.functional as F\nfrom ._lowrank import svd_lowrank, pca_"
  },
  {
    "path": "patches/pytorch/1.6.0/functional.py",
    "chars": 56050,
    "preview": "from typing import Tuple, Optional\n\nimport librosa  # STFT patch for aarch64\nimport numpy as np\n\nimport torch\nimport tor"
  },
  {
    "path": "patches/pytorch/1.7.0/functional.diff",
    "chars": 893,
    "preview": "4a5,7\n> import librosa  # STFT patch for aarch64\n> import numpy as np\n> \n515,516c518,528\n<     return _VF.stft(input, n_"
  },
  {
    "path": "patches/pytorch/1.7.0/functional.original.py",
    "chars": 63373,
    "preview": "from typing import (\n    Tuple, Optional, Union, Any, Sequence, TYPE_CHECKING\n)\n\nimport torch\nimport torch.nn.functional"
  },
  {
    "path": "patches/pytorch/1.7.0/functional.py",
    "chars": 63928,
    "preview": "from typing import (\n    Tuple, Optional, Union, Any, Sequence, TYPE_CHECKING\n)\n\nimport librosa  # STFT patch for aarch6"
  },
  {
    "path": "patches/transformers/4.5.0/convert_graph_to_onnx.diff",
    "chars": 1505,
    "preview": "14a15,17\n> import os \n> import json\n> \n83a87,91\n>             \"--save-config\",\n>             action=\"store_true\",\n>     "
  },
  {
    "path": "patches/transformers/4.5.0/convert_graph_to_onnx.original.py",
    "chars": 18640,
    "preview": "# Copyright 2020 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "patches/transformers/4.5.0/convert_graph_to_onnx.py",
    "chars": 19675,
    "preview": "# Copyright 2020 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "patches/transformers/4.5.0/modeling_distilbert.py",
    "chars": 38501,
    "preview": "# coding=utf-8\n# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc.\n#\n# L"
  },
  {
    "path": "patches/transformers/4.5.1/convert_graph_to_onnx.diff",
    "chars": 1505,
    "preview": "14a15,17\n> import os \n> import json\n> \n83a87,91\n>             \"--save-config\",\n>             action=\"store_true\",\n>     "
  },
  {
    "path": "patches/transformers/4.5.1/convert_graph_to_onnx.original.py",
    "chars": 18640,
    "preview": "# Copyright 2020 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "patches/transformers/4.5.1/convert_graph_to_onnx.py",
    "chars": 19675,
    "preview": "# Copyright 2020 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "patches/transformers/4.5.1/modeling_distilbert.diff",
    "chars": 398,
    "preview": "183,184c183,184\n<         mask = (mask == 0).view(mask_reshp).expand_as(scores)  # (bs, n_heads, q_length, k_length)\n<  "
  },
  {
    "path": "patches/transformers/4.5.1/modeling_distilbert.original.py",
    "chars": 38501,
    "preview": "# coding=utf-8\n# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc.\n#\n# L"
  },
  {
    "path": "patches/transformers/4.5.1/modeling_distilbert.py",
    "chars": 38501,
    "preview": "# coding=utf-8\n# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc.\n#\n# L"
  },
  {
    "path": "ros/CMakeLists.txt",
    "chars": 1564,
    "preview": "cmake_minimum_required(VERSION 3.5)\nproject(jetson_voice_ros)\n\n# Default to C99\nif(NOT CMAKE_C_STANDARD)\n  set(CMAKE_C_S"
  },
  {
    "path": "ros/jetson_voice_ros/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "ros/jetson_voice_ros/asr.py",
    "chars": 2269,
    "preview": "#!/usr/bin/env python3\nimport os\nimport rclpy\nimport numpy as np\n\nfrom rclpy.node import Node\nfrom std_msgs.msg import S"
  },
  {
    "path": "ros/jetson_voice_ros/audio_input.py",
    "chars": 4037,
    "preview": "#!/usr/bin/env python3\nimport os\nimport rclpy\nimport numpy as np\n\nfrom rclpy.node import Node\n\nfrom jetson_voice.utils i"
  },
  {
    "path": "ros/jetson_voice_ros/audio_output.py",
    "chars": 2757,
    "preview": "#!/usr/bin/env python3\nimport os\nimport rclpy\nimport numpy as np\n\nfrom rclpy.node import Node\n\nfrom jetson_voice import "
  },
  {
    "path": "ros/jetson_voice_ros/nlp_intent_slot.py",
    "chars": 2264,
    "preview": "#!/usr/bin/env python3\nimport os\nimport rclpy\n\nfrom rclpy.node import Node\nfrom std_msgs.msg import String\n\nfrom jetson_"
  },
  {
    "path": "ros/jetson_voice_ros/nlp_question_answer.py",
    "chars": 2123,
    "preview": "#!/usr/bin/env python3\nimport os\nimport rclpy\n\nfrom rclpy.node import Node\nfrom std_msgs.msg import String\n\nfrom jetson_"
  },
  {
    "path": "ros/jetson_voice_ros/tts.py",
    "chars": 1776,
    "preview": "#!/usr/bin/env python3\nimport os\nimport rclpy\nimport numpy as np\n\nfrom rclpy.node import Node\nfrom std_msgs.msg import S"
  },
  {
    "path": "ros/launch/asr.launch.py",
    "chars": 1635,
    "preview": "#\r\n# Launch file for playback of an audio stream or wav file.\r\n#\r\nimport os\r\n\r\nfrom launch import LaunchDescription\r\nfro"
  },
  {
    "path": "ros/launch/audio_playback.launch.py",
    "chars": 1849,
    "preview": "#\r\n# Launch file for playback of an audio stream or wav file.\r\n#\r\nimport os\r\n\r\nfrom launch import LaunchDescription\r\nfro"
  },
  {
    "path": "ros/launch/tts.launch.py",
    "chars": 1833,
    "preview": "#\r\n# Launch file for playback of an audio stream or wav file.\r\n#\r\nimport os\r\n\r\nfrom launch import LaunchDescription\r\nfro"
  },
  {
    "path": "ros/msg/Audio.msg",
    "chars": 52,
    "preview": "std_msgs/Header header\r\nAudioInfo info\r\nuint8[] data"
  },
  {
    "path": "ros/msg/AudioInfo.msg",
    "chars": 205,
    "preview": "# Number of channels\r\nuint8 channels\r\n\r\n# Sampling rate [Hz]\r\nuint32 sample_rate\r\n\r\n# Audio format (e.g. int16, float32)"
  },
  {
    "path": "ros/msg/IntentSlot.msg",
    "chars": 212,
    "preview": "# the original query text\r\nstd_msgs/String query\r\n\r\n# the classified intent label\r\nstd_msgs/String intent\r\n\r\n# the inten"
  },
  {
    "path": "ros/msg/QuestionAnswerQuery.msg",
    "chars": 104,
    "preview": "# the question being asked\r\nstd_msgs/String question\r\n\r\n# the context paragraph\r\nstd_msgs/String context"
  },
  {
    "path": "ros/msg/QuestionAnswerResult.msg",
    "chars": 175,
    "preview": "# the question that was asked\r\nstd_msgs/String question\r\n\r\n# the answer to the question\r\nstd_msgs/String answer\r\n\r\n# the"
  },
  {
    "path": "ros/msg/Slot.msg",
    "chars": 174,
    "preview": "# the slot class label\r\nstd_msgs/String slot\r\n\r\n# the relevant text from the original query\r\nstd_msgs/String text\r\n\r\n# c"
  },
  {
    "path": "ros/package.xml",
    "chars": 884,
    "preview": "<?xml version=\"1.0\"?>\n<?xml-model href=\"http://download.ros.org/schema/package_format3.xsd\" schematypens=\"http://www.w3."
  },
  {
    "path": "scripts/list_audio_devices.py",
    "chars": 118,
    "preview": "#!/usr/bin/env python3\n# coding: utf-8\n\nfrom jetson_voice import list_audio_devices\n    \nlist_audio_devices()\n   \n    "
  },
  {
    "path": "scripts/list_models.py",
    "chars": 104,
    "preview": "#!/usr/bin/env python3\n# coding: utf-8\n\nfrom jetson_voice import list_models\n    \nlist_models()\n   \n    "
  },
  {
    "path": "scripts/nemo_export_onnx.py",
    "chars": 2403,
    "preview": "#!/usr/bin/env python3\n# coding: utf-8\n\nimport os\nimport argparse\nimport pprint\nimport json\n\nimport nemo\nimport nemo.col"
  },
  {
    "path": "scripts/nemo_list_models.py",
    "chars": 1791,
    "preview": "#!/usr/bin/env python3\r\n# coding: utf-8\r\n\r\nimport nemo\r\nimport nemo.collections.asr as nemo_asr\r\nimport nemo.collections"
  },
  {
    "path": "scripts/nemo_train_classifier.py",
    "chars": 4151,
    "preview": "#!/usr/bin/env python3\r\n# coding: utf-8\r\n\r\nimport os\r\nimport argparse\r\nimport torch\r\nimport pytorch_lightning as pl\r\n\r\nf"
  },
  {
    "path": "scripts/nemo_train_intent.py",
    "chars": 3861,
    "preview": "#!/usr/bin/env python3\n# coding: utf-8\n\nimport os\nimport argparse\nimport torch\nimport pytorch_lightning as pl\n\nfrom omeg"
  },
  {
    "path": "scripts/nemo_train_ner.py",
    "chars": 3253,
    "preview": "#!/usr/bin/env python3\r\n# coding: utf-8\r\n\r\nimport os\r\nimport argparse\r\nimport torch\r\nimport pytorch_lightning as pl\r\n\r\nf"
  },
  {
    "path": "scripts/nemo_train_qa.py",
    "chars": 3637,
    "preview": "#!/usr/bin/env python3\r\n# coding: utf-8\r\n\r\nimport os\r\nimport argparse\r\nimport torch\r\nimport pytorch_lightning as pl\r\n\r\nf"
  },
  {
    "path": "scripts/os_version.sh",
    "chars": 956,
    "preview": "#!/usr/bin/env bash\n\nARCH=$(uname -i)\necho \"ARCH:  $ARCH\"\n\nif [ $ARCH = \"aarch64\" ]; then\n\tL4T_VERSION_STRING=$(head -n "
  },
  {
    "path": "scripts/record_mic.py",
    "chars": 1452,
    "preview": "#!/usr/bin/env python3\n# coding: utf-8\n\nimport sys\nimport signal\nimport argparse\n\nfrom jetson_voice import AudioInput, l"
  },
  {
    "path": "scripts/start_jupyter.sh",
    "chars": 285,
    "preview": "#!/usr/bin/env bash\n\njupyter lab --ip 0.0.0.0 --port 8888 --allow-root &> /var/log/jupyter.log\n\necho \"allow 10 sec for J"
  },
  {
    "path": "tests/run_tests.py",
    "chars": 3514,
    "preview": "#!/usr/bin/env python3\n# coding: utf-8\n\nimport os\nimport sys\nimport json\nimport logging\nimport argparse\nimport datetime\n"
  },
  {
    "path": "tests/test_asr.py",
    "chars": 4011,
    "preview": "#!/usr/bin/env python3\n# coding: utf-8\n\nimport os\nimport sys\nimport json\nimport nltk\nimport logging\n\nfrom jetson_voice i"
  },
  {
    "path": "tests/test_nlp.py",
    "chars": 5394,
    "preview": "#!/usr/bin/env python3\n# coding: utf-8\n\nimport os\nimport sys\nimport json\nimport nltk\nimport pprint\nimport logging\n\nfrom "
  },
  {
    "path": "tests/test_tts.py",
    "chars": 3917,
    "preview": "#!/usr/bin/env python3\n# coding: utf-8\n\nimport os\nimport sys\nimport json\nimport librosa\nimport logging\nimport datetime\n\n"
  }
]

About this extraction

This page contains the full source code of the dusty-nv/jetson-voice GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 115 files (749.0 KB), approximately 187.9k tokens, and a symbol index with 589 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Extract another repo