Repository: dusty-nv/jetson-voice Branch: master Commit: c6a8c9552c70 Files: 115 Total size: 749.0 KB Directory structure: gitextract_8xzz9c2n/ ├── .dockerignore ├── .gitignore ├── .gitmodules ├── Dockerfile.aarch64 ├── Dockerfile.ros ├── Dockerfile.runtime ├── Dockerfile.x86_64 ├── README.md ├── docker/ │ ├── build.sh │ ├── push.sh │ ├── run.sh │ └── tag.sh ├── examples/ │ ├── asr.py │ ├── assistant.py │ ├── nlp.py │ ├── nlp_qa.py │ └── tts.py ├── jetson_voice/ │ ├── __init__.py │ ├── asr.py │ ├── auto.py │ ├── backends/ │ │ ├── onnxruntime/ │ │ │ ├── __init__.py │ │ │ └── ort_model.py │ │ ├── riva/ │ │ │ ├── __init__.py │ │ │ ├── riva_asr.py │ │ │ └── riva_tts.py │ │ └── tensorrt/ │ │ ├── __init__.py │ │ ├── trt_binding.py │ │ ├── trt_builder.py │ │ └── trt_model.py │ ├── models/ │ │ ├── __init__.py │ │ ├── asr/ │ │ │ ├── __init__.py │ │ │ ├── asr_engine.py │ │ │ ├── ctc_beamsearch.py │ │ │ ├── ctc_decoder.py │ │ │ ├── ctc_greedy.py │ │ │ └── ctc_utils.py │ │ ├── nlp/ │ │ │ ├── __init__.py │ │ │ ├── intent_slot.py │ │ │ ├── nlp_utils.py │ │ │ ├── question_answer.py │ │ │ ├── text_classification.py │ │ │ └── token_classification.py │ │ └── tts/ │ │ ├── __init__.py │ │ └── tts_engine.py │ ├── nlp.py │ ├── tts.py │ └── utils/ │ ├── __init__.py │ ├── audio.py │ ├── config.py │ ├── resource.py │ └── softmax.py ├── patches/ │ ├── nemo/ │ │ ├── 1.0.0rc1/ │ │ │ ├── exportable.original.py │ │ │ ├── exportable.py │ │ │ ├── nlp/ │ │ │ │ ├── __init__.py │ │ │ │ ├── distilbert.diff │ │ │ │ ├── distilbert.original.py │ │ │ │ ├── distilbert.py │ │ │ │ ├── huggingface_utils.py │ │ │ │ ├── location.txt │ │ │ │ └── mobilebert.py │ │ │ ├── setup.original.py │ │ │ └── setup.py │ │ └── 1.6.2/ │ │ ├── requirements.original.txt │ │ ├── requirements.txt │ │ ├── requirements_nlp.original.txt │ │ └── requirements_nlp.txt │ ├── pytorch/ │ │ ├── 1.6.0/ │ │ │ ├── functional.diff │ │ │ ├── functional.original.py │ │ │ └── functional.py │ │ └── 1.7.0/ │ │ ├── functional.diff │ │ ├── functional.original.py │ │ └── functional.py │ └── transformers/ │ ├── 4.5.0/ │ │ ├── convert_graph_to_onnx.diff │ │ ├── convert_graph_to_onnx.original.py │ │ ├── convert_graph_to_onnx.py │ │ └── modeling_distilbert.py │ └── 4.5.1/ │ ├── convert_graph_to_onnx.diff │ ├── convert_graph_to_onnx.original.py │ ├── convert_graph_to_onnx.py │ ├── modeling_distilbert.diff │ ├── modeling_distilbert.original.py │ └── modeling_distilbert.py ├── ros/ │ ├── CMakeLists.txt │ ├── jetson_voice_ros/ │ │ ├── __init__.py │ │ ├── asr.py │ │ ├── audio_input.py │ │ ├── audio_output.py │ │ ├── nlp_intent_slot.py │ │ ├── nlp_question_answer.py │ │ └── tts.py │ ├── launch/ │ │ ├── asr.launch.py │ │ ├── audio_playback.launch.py │ │ └── tts.launch.py │ ├── msg/ │ │ ├── Audio.msg │ │ ├── AudioInfo.msg │ │ ├── IntentSlot.msg │ │ ├── QuestionAnswerQuery.msg │ │ ├── QuestionAnswerResult.msg │ │ └── Slot.msg │ └── package.xml ├── scripts/ │ ├── list_audio_devices.py │ ├── list_models.py │ ├── nemo_export_onnx.py │ ├── nemo_list_models.py │ ├── nemo_train_classifier.py │ ├── nemo_train_intent.py │ ├── nemo_train_ner.py │ ├── nemo_train_qa.py │ ├── os_version.sh │ ├── record_mic.py │ └── start_jupyter.sh └── tests/ ├── run_tests.py ├── test_asr.py ├── test_nlp.py └── test_tts.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .dockerignore ================================================ data/ .git .cache ================================================ 
FILE: .gitignore ================================================ data/ logs/ packages/ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ ================================================ FILE: .gitmodules ================================================ [submodule "docker/containers"] path = docker/containers url = https://github.com/dusty-nv/jetson-containers ================================================ FILE: Dockerfile.aarch64 ================================================ # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), # to deal in the Software without restriction, including without limitation # the rights to use, copy, modify, merge, publish, distribute, sublicense, # and/or sell copies of the Software, and to permit persons to whom the # Software is furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. 
ARG BASE_IMAGE FROM ${BASE_IMAGE} ENV DEBIAN_FRONTEND=noninteractive ENV SHELL /bin/bash ENV LANG='en_US.UTF-8' LANGUAGE='en_US:en' LC_ALL='en_US.UTF-8' ARG MAKEFLAGS=-j$(nproc) ARG WORKSPACE=/jetson-voice WORKDIR ${WORKSPACE} # alias python3 -> python RUN rm /usr/bin/python && \ ln -s /usr/bin/python3 /usr/bin/python && \ ln -s /usr/bin/pip3 /usr/bin/pip ################################################################ ## tokenizers/transformers ################################################################ RUN apt-get update && \ apt-get install -y --no-install-recommends \ cmake \ curl \ pkg-config \ protobuf-compiler \ libprotoc-dev \ nano \ tzdata \ libssl-dev \ && rm -rf /var/lib/apt/lists/* \ && apt-get clean # install sentencepiece RUN git clone https://github.com/google/sentencepiece && \ cd sentencepiece && \ mkdir build && \ cd build && \ cmake .. && \ make -j $(nproc) && \ make install && \ ldconfig -v && \ cd .. && \ cd python && \ python3 setup.py install --verbose && \ cd ../../ && \ rm -r -f sentencepiece # install rust (used by tokenizers) RUN curl https://sh.rustup.rs -sSf | sh -s -- -y ENV PATH="/root/.cargo/bin:${PATH}" RUN rustc --version && \ pip3 install setuptools-rust # install tokenizers RUN pip3 install tokenizers --verbose # Apache arrow is needed by datasets package ('pip install pyarrow' is broken, so built from source) # https://github.com/apache/arrow/blob/master/docs/source/developers/python.rst#using-pip # https://raspberrypi.stackexchange.com/a/117723 RUN apt-get update && \ apt-get install -y --no-install-recommends \ libjemalloc-dev \ libboost-dev \ libboost-filesystem-dev \ libboost-system-dev \ libboost-regex-dev \ autoconf \ flex \ bison \ && rm -rf /var/lib/apt/lists/* \ && apt-get clean RUN git clone --branch apache-arrow-3.0.0 https://github.com/apache/arrow.git && \ cd arrow/cpp && \ mkdir build && \ cd build && \ export ARROW_HOME=/usr/local && \ cmake \ -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ -DCMAKE_INSTALL_LIBDIR=lib \ -DARROW_WITH_BZ2=ON \ -DARROW_WITH_ZLIB=ON \ -DARROW_WITH_ZSTD=ON \ -DARROW_WITH_LZ4=ON \ -DARROW_WITH_SNAPPY=ON \ -DARROW_PARQUET=ON \ -DARROW_CUDA=ON \ -DARROW_PYTHON=ON \ -DARROW_BUILD_TESTS=OFF \ .. 
&& \ make -j$(nproc) && \ make install && \ cd ../../python && \ python3 setup.py build_ext --build-type=release --with-parquet --with-cuda --verbose && \ python3 setup.py install --verbose && \ cd ../../ && \ rm -r -f arrow RUN pip3 show pyarrow && \ python3 -c "import pyarrow" && \ python3 -c "from pyarrow import cuda" # install huggingface (locked to 4.5.1, which the patches are based on) # datasets package is needed to run the huggingface examples RUN pip3 install transformers==4.5.1 datasets --verbose ################################################################ ## onnx / onnxruntime / onnx-graphsurgeon ################################################################ ARG ONNXRUNTIME_URL=https://nvidia.box.com/shared/static/ukszbm1iklzymrt54mgxbzjfzunq7i9t.whl ARG ONNXRUNTIME_WHL=onnxruntime_gpu-1.7.0-cp36-cp36m-linux_aarch64.whl RUN wget --quiet --show-progress --progress=bar:force:noscroll --no-check-certificate ${ONNXRUNTIME_URL} -O ${ONNXRUNTIME_WHL} && \ pip3 install ${ONNXRUNTIME_WHL} --verbose && \ pip3 install onnx psutil sympy --verbose && \ rm ${ONNXRUNTIME_WHL} # install onnx-graphsurgeon RUN cd /opt && \ git clone --recursive https://github.com/nvidia/tensorrt tensorrt && \ cd tensorrt/tools/onnx-graphsurgeon && \ python3 setup.py install --verbose && \ cd ../../../ && \ rm -r -f tensorrt ################################################################ ## NeMo ################################################################ RUN apt-get update && \ apt-get install -y --no-install-recommends \ libopencc-dev \ python3-tk \ libmecab-dev \ mecab \ && rm -rf /var/lib/apt/lists/* \ && apt-get clean RUN cd /opt && \ git clone --recursive --branch v0.11.1 https://github.com/pytorch/text torchtext && \ cd torchtext && \ python3 setup.py clean install RUN pip3 show torch torchvision torchaudio torchtext # clone/build nemo ARG NEMO_VERSION RUN cd /opt && git clone --recursive --branch v${NEMO_VERSION} https://github.com/nvidia/nemo # needed for nemo 1.0 #COPY patches/nemo/${NEMO_VERSION}/setup.py /opt/nemo/setup.py # needed for nemo 1.6 COPY patches/nemo/${NEMO_VERSION}/requirements.txt /opt/nemo/requirements/requirements.txt COPY patches/nemo/${NEMO_VERSION}/requirements_nlp.txt /opt/nemo/requirements/requirements_nlp.txt RUN pip3 install -r /opt/nemo/requirements/requirements.txt --verbose RUN pip3 install -r /opt/nemo/requirements/requirements_asr.txt --verbose RUN pip3 install -r /opt/nemo/requirements/requirements_nlp.txt --verbose RUN pip3 install -r /opt/nemo/requirements/requirements_tts.txt --verbose #RUN pip3 install omegaconf==2.1.0dev24 --verbose RUN cd /opt/nemo && python3 setup.py install --verbose ################################################################ ## ctc-decoders ################################################################ RUN apt-get update && \ apt-get install -y --no-install-recommends \ swig \ && rm -rf /var/lib/apt/lists/* \ && apt-get clean RUN git clone https://github.com/dusty-nv/OpenSeq2Seq -b ctc-decoders && \ cd OpenSeq2Seq/decoders && \ ./setup.sh RUN pip3 install git+https://github.com/NVIDIA/dllogger RUN pip3 install nltk ################################################################ ## Riva GRPC ################################################################ ARG RIVA_URL=https://nvidia.box.com/shared/static/cu8z4t1n6shkxl6z5nh9hpkpn9yxomcz.whl ARG RIVA_WHL=riva_api-1.0.0ea-py3-none-any.whl RUN wget --quiet --show-progress --progress=bar:force:noscroll --no-check-certificate ${RIVA_URL} -O ${RIVA_WHL} && \ pip3 install ${RIVA_WHL} 
--verbose && \ rm ${RIVA_WHL} ################################################################ ## install some audio stuff ################################################################ RUN apt-get update && \ apt-get install -y --no-install-recommends \ alsa-base \ libasound2-dev \ alsa-utils \ portaudio19-dev \ libsndfile1 \ unzip \ && rm -rf /var/lib/apt/lists/* \ && apt-get clean RUN pip3 install soundfile pyaudio wave ################################################################ ## various patches to install ################################################################ #COPY patches patches #RUN PYTHON_ROOT=`pip3 show torch | grep Location: | cut -d' ' -f2` && \ # PYTORCH_VERSION=`pip3 show torch | grep Version: | cut -d' ' -f2` && \ # TRANSFORMERS_VERSION=`pip3 show transformers | grep Version: | cut -d' ' -f2` && \ # NEMO_PATH="$PYTHON_ROOT/nemo_toolkit-${NEMO_VERSION}-py3.6.egg/nemo" && \ # echo "Python package root path: $PYTHON_ROOT" && \ # echo "Applying patches for PyTorch $PYTORCH_VERSION" && \ # echo "Applying patches for transformers $TRANSFORMERS_VERSION" && \ # cp patches/pytorch/$PYTORCH_VERSION/functional.py $PYTHON_ROOT/torch/functional.py && \ # cp patches/transformers/$TRANSFORMERS_VERSION/convert_graph_to_onnx.py $PYTHON_ROOT/transformers/convert_graph_to_onnx.py && \ # cp patches/transformers/$TRANSFORMERS_VERSION/modeling_distilbert.py $PYTHON_ROOT/transformers/models/distilbert/modeling_distilbert.py && \ # cp patches/nemo/${NEMO_VERSION}/nlp/distilbert.py $NEMO_PATH/collections/nlp/modules/common/huggingface/distilbert.py && \ # cp patches/nemo/${NEMO_VERSION}/exportable.py $NEMO_PATH/core/classes/exportable.py # set Python to unicode ENV PYTHONIOENCODING=utf-8 # disable JupyterLab from auto-starting (inherited behavior from l4t-ml) CMD /bin/bash ================================================ FILE: Dockerfile.ros ================================================ # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), # to deal in the Software without restriction, including without limitation # the rights to use, copy, modify, merge, publish, distribute, sublicense, # and/or sell copies of the Software, and to permit persons to whom the # Software is furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. 
ARG BASE_IMAGE=jetson-voice:r32.5.0-foxy-base FROM ${BASE_IMAGE} ################################################################ ## install jetson_voice_ros package ################################################################ COPY ros /tmp/jetson_voice_ros RUN source ${ROS_ROOT}/install/setup.bash && \ mkdir -p ${ROS_ROOT}/src && \ cd ${ROS_ROOT} && \ cp -r /tmp/jetson_voice_ros src && \ # build the package colcon build \ --merge-install \ --base-paths src/jetson_voice_ros \ --event-handlers console_direct+ && \ # clean-up build files rm -rf ${ROS_ROOT}/src && \ rm -rf ${ROS_ROOT}/logs && \ rm -rf ${ROS_ROOT}/build ################################################################ ## project install ################################################################ ARG WORKSPACE=/jetson-voice COPY jetson_voice ${WORKSPACE}/jetson_voice COPY examples ${WORKSPACE}/examples COPY scripts ${WORKSPACE}/scripts COPY tests ${WORKSPACE}/tests ENV PYTHONPATH="${WORKSPACE}:${PYTHONPATH}" ================================================ FILE: Dockerfile.runtime ================================================ # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), # to deal in the Software without restriction, including without limitation # the rights to use, copy, modify, merge, publish, distribute, sublicense, # and/or sell copies of the Software, and to permit persons to whom the # Software is furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. ARG BASE_IMAGE=jetson-voice:r32.5.0-base FROM ${BASE_IMAGE} ARG WORKSPACE=/jetson-voice WORKDIR ${WORKSPACE} ################################################################ ## project install ################################################################ COPY jetson_voice ${WORKSPACE}/jetson_voice COPY examples ${WORKSPACE}/examples COPY scripts ${WORKSPACE}/scripts COPY tests ${WORKSPACE}/tests ENV PYTHONPATH="${WORKSPACE}:${PYTHONPATH}" ================================================ FILE: Dockerfile.x86_64 ================================================ # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), # to deal in the Software without restriction, including without limitation # the rights to use, copy, modify, merge, publish, distribute, sublicense, # and/or sell copies of the Software, and to permit persons to whom the # Software is furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. 
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. ARG BASE_IMAGE FROM ${BASE_IMAGE} ENV DEBIAN_FRONTEND=noninteractive ENV SHELL /bin/bash ARG MAKEFLAGS=-j$(nproc) ARG WORKSPACE=/jetson-voice WORKDIR ${WORKSPACE} ################################################################ ## PyCUDA ################################################################ RUN pip3 install pycuda six --verbose ################################################################ ## ctc-decoders ################################################################ RUN apt-get update && \ apt-get install -y --no-install-recommends \ swig \ && rm -rf /var/lib/apt/lists/* \ && apt-get clean RUN git clone https://github.com/dusty-nv/OpenSeq2Seq -b ctc-decoders && \ cd OpenSeq2Seq/decoders && \ ./setup.sh RUN pip3 install git+https://github.com/NVIDIA/dllogger RUN pip3 install nltk ################################################################ ## Jarvis GRPC ################################################################ ARG JARVIS_URL=https://nvidia.box.com/shared/static/on9t7zqes2s6er1wpumidnc6rphwsyy7.whl ARG JARVIS_WHL=jarvis_api-1.0.0b1-py3-none-any.whl RUN wget --quiet --show-progress --progress=bar:force:noscroll --no-check-certificate ${JARVIS_URL} -O ${JARVIS_WHL} && \ pip3 install ${JARVIS_WHL} --verbose && \ rm ${JARVIS_WHL} ################################################################ ## install some audio stuff ################################################################ RUN apt-get update && \ apt-get install -y --no-install-recommends \ alsa-base \ libasound2-dev \ alsa-utils \ portaudio19-dev \ libsndfile1 \ unzip \ tzdata \ nano \ && rm -rf /var/lib/apt/lists/* \ && apt-get clean RUN pip3 install soundfile pyaudio wave ################################################################ ## various patches to install ################################################################ COPY patches patches ARG NEMO_VERSION RUN PYTHON_ROOT=`pip3 show transformers | grep Location: | cut -d' ' -f2` && \ TRANSFORMERS_VERSION=`pip3 show transformers | grep Version: | cut -d' ' -f2` && \ echo "Python package root path: $PYTHON_ROOT" && \ echo "Applying patches for transformers $TRANSFORMERS_VERSION" && \ cp patches/transformers/$TRANSFORMERS_VERSION/convert_graph_to_onnx.py $PYTHON_ROOT/transformers/convert_graph_to_onnx.py && \ cp patches/transformers/$TRANSFORMERS_VERSION/modeling_distilbert.py $PYTHON_ROOT/transformers/models/distilbert/modeling_distilbert.py && \ cp patches/nemo/${NEMO_VERSION}/nlp/distilbert.py $PYTHON_ROOT/nemo/collections/nlp/modules/common/huggingface/distilbert.py && \ cp patches/nemo/${NEMO_VERSION}/exportable.py $PYTHON_ROOT/nemo/core/classes/exportable.py # set Python to unicode ENV PYTHONIOENCODING=utf-8 ================================================ FILE: README.md ================================================ # jetson-voice jetson-voice is an ASR/NLP/TTS deep learning inference library for Jetson Nano, TX1/TX2, Xavier NX, and AGX Xavier. It supports Python and JetPack 4.4.1 or newer. 
The DNN models were trained with [NeMo](https://github.com/NVIDIA/NeMo) and deployed with [TensorRT](https://developer.nvidia.com/tensorrt) for optimized performance. All computation is performed using the onboard GPU. Currently the following capabilities are included: * [Automatic Speech Recognition (ASR)](#automatic-speech-recognition-asr) * [Streaming ASR (QuartzNet)](#automatic-speech-recognition-asr) * [Command/Keyword Recognition (MatchboxNet)](#commandkeyword-recognition) * [Voice Activity Detection (VAD MarbleNet)](#voice-activity-detection-vad) * [Natural Language Processing (NLP)](#natural-language-processing-nlp) * [Joint Intent/Slot Classification](#joint-intentslot-classification) * [Text Classification (Sentiment Analysis)](#text-classification) * [Token Classification (Named Entity Recognition)](#token-classification) * [Question/Answering (QA)](#questionanswering) * [Text-to-Speech (TTS)](#text-to-speech-tts) The NLP models use the [DistilBERT](https://arxiv.org/abs/1910.01108) transformer architecture for reduced memory usage and increased performance. For samples of the text-to-speech output, see the [TTS Audio Samples](#tts-audio-samples) section below. ## Running the Container jetson-voice is distributed as a Docker container due to the number of dependencies. There are pre-built container images available on DockerHub for JetPack 4.4.1 and newer: ``` dustynv/jetson-voice:r32.4.4 # JetPack 4.4.1 (L4T R32.4.4) dustynv/jetson-voice:r32.5.0 # JetPack 4.5 (L4T R32.5.0) / JetPack 4.5.1 (L4T R32.5.1) dustynv/jetson-voice:r32.6.1 # JetPack 4.6 (L4T R32.6.1) dustynv/jetson-voice:r32.7.1 # JetPack 4.6.1 (L4T R32.7.1) ``` To download and run the container, you can simply clone this repo and use the `docker/run.sh` script: ``` bash $ git clone --branch dev https://github.com/dusty-nv/jetson-voice $ cd jetson-voice $ docker/run.sh ``` > **note**: if you want to use a USB microphone or speaker, plug it in *before* you start the container There are some optional arguments to `docker/run.sh` that you can use: * `-r` (`--run`) specifies a run command, otherwise the container will start in an interactive shell. * `-v` (`--volume`) mounts a directory from the host into the container (`/host/path:/container/path`) * `--dev` starts the container in development mode, where all the source files are mounted for easy editing The run script will automatically mount the `data/` directory into the container, which stores the models and other data files. If you save files from the container there, they will also show up under `data/` on the host. ## Automatic Speech Recognition (ASR) The speech recognition in jetson-voice is a streaming service, so it's intended to be used on live sources, and it transcribes the audio in 1-second chunks. It uses a [QuartzNet-15x5](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#quartznet) model followed by a CTC beam search decoder and language model to further refine the raw output of the network. It detects breaks in the audio to determine the end of sentences. For information about using the ASR APIs, please refer to [`jetson_voice/asr.py`](jetson_voice/asr.py) and see [`examples/asr.py`](examples/asr.py). After you start the container, first run a test audio file (wav/ogg/flac) through [`examples/asr.py`](examples/asr.py) to verify that the system is functional.
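In code, the streaming API from [`examples/asr.py`](examples/asr.py) boils down to a few lines. A condensed sketch (the model name and wav path are just example values, and `AudioInput` is assumed to default to no microphone when only `wav` is given):

``` python
from jetson_voice import ASR, AudioInput

asr = ASR('quartznet')   # the first load builds a TensorRT engine, which is then cached to disk

# stream audio from a file (or pass mic=<device id/name> for a live microphone)
stream = AudioInput(wav='data/audio/dusty.wav',
                    sample_rate=asr.sample_rate,
                    chunk_size=asr.chunk_size)

for samples in stream:
    for transcript in asr(samples):    # list of running phrases/sentences
        print(transcript['text'])
        if transcript['end']:          # a break in the audio ends the sentence
            print('')
```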
Run this command (and all subsequent commands) inside the container: ``` bash $ examples/asr.py --wav data/audio/dusty.wav hi hi hi this is dust hi hi this is dusty check hi hi this is dusty check one two hi hi this is dusty check one two three hi hi this is dusty check one two three. what's the weather or what's the weather going to be tomorrow what's the weather going to be tomorrow in pittsburgh what's the weather going to be tomorrow in pittsburgh. today is today is wednesday today is wednesday tomorrow is thursday today is wednesday tomorrow is thursday. i would like i would like to order a large i would like to order a large pepperoni pizza i would like to order a large pepperoni pizza. is it going to be is it going to be cloudy tomorrow. ``` > The first time you run each model, TensorRT will take a few minutes to optimize it. > This optimized model is then cached to disk, so the next time you run the model it will load faster. #### Live Microphone To test the ASR on a mic, first list the audio devices in your system to get the audio device IDs: ``` bash $ scripts/list_audio_devices.py ---------------------------------------------------- Audio Input Devices ---------------------------------------------------- Input Device ID 1 - 'tegra-snd-t210ref-mobile-rt565x: - (hw:1,0)' (inputs=16) (sample_rate=44100) Input Device ID 2 - 'tegra-snd-t210ref-mobile-rt565x: - (hw:1,1)' (inputs=16) (sample_rate=44100) Input Device ID 3 - 'tegra-snd-t210ref-mobile-rt565x: - (hw:1,2)' (inputs=16) (sample_rate=44100) Input Device ID 4 - 'tegra-snd-t210ref-mobile-rt565x: - (hw:1,3)' (inputs=16) (sample_rate=44100) Input Device ID 5 - 'tegra-snd-t210ref-mobile-rt565x: - (hw:1,4)' (inputs=16) (sample_rate=44100) Input Device ID 6 - 'tegra-snd-t210ref-mobile-rt565x: - (hw:1,5)' (inputs=16) (sample_rate=44100) Input Device ID 7 - 'tegra-snd-t210ref-mobile-rt565x: - (hw:1,6)' (inputs=16) (sample_rate=44100) Input Device ID 8 - 'tegra-snd-t210ref-mobile-rt565x: - (hw:1,7)' (inputs=16) (sample_rate=44100) Input Device ID 9 - 'tegra-snd-t210ref-mobile-rt565x: - (hw:1,8)' (inputs=16) (sample_rate=44100) Input Device ID 10 - 'tegra-snd-t210ref-mobile-rt565x: - (hw:1,9)' (inputs=16) (sample_rate=44100) Input Device ID 11 - 'Logitech H570e Mono: USB Audio (hw:2,0)' (inputs=2) (sample_rate=44100) Input Device ID 12 - 'Samson Meteor Mic: USB Audio (hw:3,0)' (inputs=2) (sample_rate=44100) ``` > If you don't see your audio device listed, exit and restart the container. > USB devices should be attached *before* the container is started. Then run the ASR example with the `--mic` option, and specify either the device ID or name: ``` bash $ examples/asr.py --mic 11 hey hey how are you guys hey how are you guys. # (Press Ctrl+C to exit) ``` ## ASR Classification There are other ASR models included for command/keyword recognition ([MatchboxNet](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/speech_classification/models.html#matchboxnet-speech-commands)) and voice activity detection ([VAD MarbleNet](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/speech_classification/models.html#marblenet-vad)). These models are smaller and faster, and classify chunks of audio as opposed to transcribing text.
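The classification models are loaded and run through the same `ASR()` API. As in [`examples/asr.py`](examples/asr.py), when `asr.classification` is true the result for each audio chunk is a `(class, probability)` pair rather than a transcript. A minimal sketch (the mic ID is an example value):

``` python
from jetson_voice import ASR, AudioInput

asr = ASR('matchboxnet')    # or 'vad_marblenet'

stream = AudioInput(mic='11', sample_rate=asr.sample_rate, chunk_size=asr.chunk_size)

for samples in stream:
    results = asr(samples)
    if asr.classification:  # true for keyword/VAD models
        print(f"class '{results[0]}' ({results[1]:.3f})")
```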
### Command/Keyword Recognition The [MatchboxNet](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/speech_classification/models.html#matchboxnet-speech-commands) model was trained on 12 keywords from the [Google Speech Commands](https://ai.googleblog.com/2017/08/launching-speech-commands-dataset.html) dataset: ``` # MatchboxNet classes "yes", "no", "up", "down", "left", "right", "on", "off", "stop", "go", "unknown", "silence" ``` You can run it through the same ASR example as above by specifying the `--model matchboxnet` argument: ``` bash $ examples/asr.py --model matchboxnet --wav data/audio/commands.wav class 'unknown' (0.384) class 'yes' (1.000) class 'no' (1.000) class 'up' (1.000) class 'down' (1.000) class 'left' (1.000) class 'left' (1.000) class 'right' (1.000) class 'on' (1.000) class 'off' (1.000) class 'stop' (1.000) class 'go' (1.000) class 'go' (1.000) class 'silence' (0.639) class 'silence' (0.576) ``` The numbers printed on the right are the classification probabilities between 0 and 1. ### Voice Activity Detection (VAD) The voice activity model ([VAD MarbleNet](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/speech_classification/models.html#marblenet-vad)) is a binary model that outputs `background` or `speech`: ``` bash $ examples/asr.py --model vad_marblenet --wav data/audio/commands.wav class 'background' (0.969) class 'background' (0.984) class 'background' (0.987) class 'speech' (0.997) class 'speech' (1.000) class 'speech' (1.000) class 'speech' (0.998) class 'background' (0.987) class 'speech' (1.000) class 'speech' (1.000) class 'speech' (1.000) class 'background' (0.988) class 'background' (0.784) ``` ## Natural Language Processing (NLP) There are two samples included for NLP: * [`examples/nlp.py`](examples/nlp.py) (intent/slot, text classification, token classification) * [`examples/nlp_qa.py`](examples/nlp_qa.py) (question/answering) These each use a [DistilBERT](https://arxiv.org/abs/1910.01108) model that has been fine-tuned for its particular task. For information about using the NLP APIs, please refer to [`jetson_voice/nlp.py`](jetson_voice/nlp.py) and see the samples above. ### Joint Intent/Slot Classification Joint intent/slot classification is the task of classifying the intent of a query and detecting all of the relevant slots (entities) for that intent. For example, in the query `What is the weather in Santa Clara tomorrow morning?`, we would like to classify the query as a `weather` intent, and detect `Santa Clara` as a location slot and `tomorrow morning` as a date_time slot. Intent and slot names are usually task-specific and defined as labels in the training data. The included intent/slot model was trained on the [NLU-Evaluation-Data](https://github.com/xliuhw/NLU-Evaluation-Data) dataset - you can find the various intent and slot classes that it supports [here](https://gist.github.com/dusty-nv/119474dfcf3bfccfbb8428951a64cd23). They are common things that you might ask a virtual assistant: ``` $ examples/nlp.py --model distilbert_intent Enter intent_slot query, or Q to quit: > What is the weather in Santa Clara tomorrow morning?
{'intent': 'weather_query', 'score': 0.7165476, 'slots': [{'score': 0.6280392, 'slot': 'place_name', 'text': 'Santa'}, {'score': 0.61760694, 'slot': 'place_name', 'text': 'Clara'}, {'score': 0.5439486, 'slot': 'date', 'text': 'tomorrow'}, {'score': 0.4520608, 'slot': 'date', 'text': 'morning'}]} > Set an alarm for 730am {'intent': 'alarm_set', 'score': 0.5713072, 'slots': [{'score': 0.40017933, 'slot': 'time', 'text': '730am'}]} > Turn up the volume {'intent': 'audio_volume_up', 'score': 0.33523008, 'slots': []} > What is my schedule for tomorrow? {'intent': 'calendar_query', 'score': 0.37434494, 'slots': [{'score': 0.5732627, 'slot': 'date', 'text': 'tomorrow'}]} > Order a pepperoni pizza from domino's {'intent': 'takeaway_order', 'score': 0.50629586, 'slots': [{'score': 0.27558547, 'slot': 'food_type', 'text': 'pepperoni'}, {'score': 0.2778827, 'slot': 'food_type', 'text': 'pizza'}, {'score': 0.21785143, 'slot': 'business_name', 'text': 'dominos'}]} > Where's the closest Starbucks? {'intent': 'recommendation_locations', 'score': 0.5438984, 'slots': [{'score': 0.1604197, 'slot': 'place_name', 'text': 'Starbucks'}]} ``` ### Text Classification In this text classification example, we'll use the included sentiment analysis model that was trained on the [Stanford Sentiment Treebank (SST-2)](https://nlp.stanford.edu/sentiment/index.html) dataset. It will label queries as either positive or negative, along with their probability: ``` $ examples/nlp.py --model distilbert_sentiment Enter text_classification query, or Q to quit: > today was warm, sunny and beautiful out {'class': 1, 'label': '1', 'score': 0.9985898} > today was cold and rainy and not very nice {'class': 0, 'label': '0', 'score': 0.99136007} ``` (class 0 is negative sentiment and class 1 is positive sentiment) ### Token Classification Whereas text classification classifies entire queries, token classification classifies individual tokens (or words). In this example, we'll be performing Named Entity Recognition (NER), which is the task of detecting and classifying key information (entities) in text. For example, in the sentence `Mary lives in Santa Clara and works at NVIDIA`, we should detect that `Mary` is a person, `Santa Clara` is a location and `NVIDIA` is a company. The included token classification model for NER was trained on the [Groningen Meaning Bank (GMB)](http://www.let.rug.nl/bjerva/gmb/about.php) and supports the following annotations in [IOB format](https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)) (short for inside, outside, beginning): * LOC = Geographical Entity * ORG = Organization * PER = Person * GPE = Geopolitical Entity * TIME = Time indicator * MISC = Artifact, Event, or Natural Phenomenon ``` bash $ examples/nlp.py --model distilbert_ner Enter token_classification query, or Q to quit: > Mary lives in Santa Clara and works at NVIDIA Mary[B-PER 0.989] lives in Santa[B-LOC 0.998] Clara[I-LOC 0.996] and works at NVIDIA[B-ORG 0.967] > Lisa's favorite place to climb in the summer is El Capitan in Yosemite National Park in California, U.S. Lisa's[B-PER 0.995] favorite place to climb in the summer[B-TIME 0.996] is El[B-PER 0.577] Capitan[I-PER 0.483] in Yosemite[B-LOC 0.987] National[I-LOC 0.988] Park[I-LOC 0.98] in California[B-LOC 0.998], U.S[B-LOC 0.997]. ``` ### Question/Answering Question/Answering (QA) works by supplying a context paragraph, from which the model then extracts the best answer.
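In code (condensed from [`examples/nlp_qa.py`](examples/nlp_qa.py)), a query is a dict containing the context paragraph and the question; the context below is an abbreviated example:

``` python
from jetson_voice import QuestionAnswer

model = QuestionAnswer('distilbert_qa_384')

query = {
    'context'  : "The Amazon rainforest is a moist broadleaf forest that covers "
                 "most of the Amazon basin of South America.",
    'question' : "Where is the Amazon rainforest?"
}

result = model(query, top_k=1)   # with top_k=1 a single result dict is returned
print('Answer:', result['answer'])
print('Score: ', result['score'])
```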
The [`nlp_qa.py`](examples/nlp_qa.py) example allows you to select from several built-in context paragraphs (or supply your own) and to ask questions about these topics. The QA model is flexible and doesn't need to be retrained for different topics, as it was trained on the [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) question/answering dataset, which allows it to extract answers from a variety of contexts. It essentially learns to identify the information most relevant to your query from the context passage, as opposed to learning the content itself. ``` bash $ examples/nlp_qa.py Context: The Amazon rainforest is a moist broadleaf forest that covers most of the Amazon basin of South America. This basin encompasses 7,000,000 square kilometres (2,700,000 sq mi), of which 5,500,000 square kilometres (2,100,000 sq mi) are covered by the rainforest. The majority of the forest is contained within Brazil, with 60% of the rainforest, followed by Peru with 13%, and Colombia with 10%. Enter a question, C to change context, P to print context, or Q to quit: > How big is the Amazon? Answer: 7,000,000 square kilometres Score: 0.24993503093719482 > which country has the most? Answer: Brazil Score: 0.5964332222938538 ``` To change the topic or create one of your own, enter `C`: ``` Enter a question, C to change context, P to print context, or Q to quit: > C Select from one of the following topics, or enter your own context paragraph: 1. Amazon 2. Geology 3. Moon Landing 4. Pi 5. Super Bowl 55 > 3 Context: The first manned Moon landing was Apollo 11 on July, 20 1969. The first human to step on the Moon was astronaut Neil Armstrong followed second by Buzz Aldrin. They landed in the Sea of Tranquility with their lunar module the Eagle. They were on the lunar surface for 2.25 hours and collected 50 pounds of moon rocks. Enter a question, C to change context, P to print context, or Q to quit: > Who was the first man on the moon? Answer: Neil Armstrong Score: 0.39105066657066345 ``` ## Text-to-Speech (TTS) The text-to-speech service uses an ensemble of two models: FastPitch to generate mel spectrograms from text, and HiFiGAN as the vocoder (female English voice). For information about using the TTS APIs, please refer to [`jetson_voice/tts.py`](jetson_voice/tts.py) and see [`examples/tts.py`](examples/tts.py). The [`examples/tts.py`](examples/tts.py) app can output the audio to a speaker, wav file, or sequence of wav files. Run it with `--list-devices` to get a list of your audio devices. ``` bash $ examples/tts.py --output-device 11 --output-wav data/audio/tts_test > The weather tomorrow is forecast to be warm and sunny with a high of 83 degrees. Run 0 -- Time to first audio: 1.820s. Generated 5.36s of audio. RTFx=2.95. Run 1 -- Time to first audio: 0.232s. Generated 5.36s of audio. RTFx=23.15. Run 2 -- Time to first audio: 0.230s. Generated 5.36s of audio. RTFx=23.31. Run 3 -- Time to first audio: 0.231s. Generated 5.36s of audio. RTFx=23.25. Run 4 -- Time to first audio: 0.230s. Generated 5.36s of audio. RTFx=23.36. Run 5 -- Time to first audio: 0.230s. Generated 5.36s of audio. RTFx=23.35. Wrote audio to data/audio/tts_test/0.wav Enter text, or Q to quit: > Sally sells seashells by the seashore. Run 0 -- Time to first audio: 0.316s. Generated 2.73s of audio. RTFx=8.63. Run 1 -- Time to first audio: 0.126s. Generated 2.73s of audio. RTFx=21.61. Run 2 -- Time to first audio: 0.127s. Generated 2.73s of audio. RTFx=21.51. Run 3 -- Time to first audio: 0.126s. Generated 2.73s of audio. RTFx=21.68.
Run 4 -- Time to first audio: 0.126s. Generated 2.73s of audio. RTFx=21.68. Run 5 -- Time to first audio: 0.126s. Generated 2.73s of audio. RTFx=21.61. Wrote audio to data/audio/tts_test/1.wav ``` #### TTS Audio Samples * [Weather forecast](data/audio/tts_examples/0.wav) (wav) * [Sally sells seashells](data/audio/tts_examples/1.wav) (wav) ## Tests There is an automated test suite included that will verify all of the models are working properly. You can run it with the `tests/run_tests.py` script: ``` bash $ tests/run_tests.py ---------------------------------------------------- TEST SUMMARY ---------------------------------------------------- test_asr.py (quartznet) PASSED test_asr.py (quartznet_greedy) PASSED test_asr.py (matchboxnet) PASSED test_asr.py (vad_marblenet) PASSED test_nlp.py (distilbert_qa_128) PASSED test_nlp.py (distilbert_qa_384) PASSED test_nlp.py (distilbert_intent) PASSED test_nlp.py (distilbert_sentiment) PASSED test_nlp.py (distilbert_ner) PASSED test_tts.py (fastpitch_hifigan) PASSED passed 10 of 10 tests saved logs to data/tests/logs/20210610_1512 ``` The logs of the individual tests are printed to the screen and saved to a timestamped directory. ================================================ FILE: docker/build.sh ================================================ #!/usr/bin/env bash ROS_DISTRO=${1:-"none"} BASE_IMAGE=$2 NEMO_VERSION="1.0.0rc1" # find container tag from os version source docker/tag.sh if [ $ARCH = "aarch64" ]; then if [ -z $BASE_IMAGE ]; then if [ $L4T_VERSION = "32.7.1" ]; then BASE_IMAGE="l4t-ml:r32.7.1-py3" #BASE_IMAGE="nvcr.io/nvidia/l4t-ml:r32.7.1-py3" NEMO_VERSION="1.6.2" elif [ $L4T_VERSION = "32.6.1" ]; then BASE_IMAGE="nvcr.io/nvidia/l4t-ml:r32.6.1-py3" elif [ $L4T_VERSION = "32.5.0" ] || [ $L4T_VERSION = "32.5.1" ]; then BASE_IMAGE="nvcr.io/nvidia/l4t-ml:r32.5.0-py3" elif [ $L4T_VERSION = "32.4.4" ]; then BASE_IMAGE="nvcr.io/nvidia/l4t-ml:r32.4.4-py3" elif [ $L4T_VERSION = "32.4.3" ]; then BASE_IMAGE="nvcr.io/nvidia/l4t-ml:r32.4.3-py3" elif [ $L4T_VERSION = "32.4.2" ]; then BASE_IMAGE="nvcr.io/nvidia/l4t-ml:r32.4.2-py3" else echo "cannot build jetson-voice docker container for L4T R$L4T_VERSION" echo "please upgrade to the latest JetPack, or build jetson-voice natively" exit 1 fi fi elif [ $ARCH = "x86_64" ]; then BASE_IMAGE=${BASE_IMAGE:-"nvcr.io/nvidia/nemo:$NEMO_VERSION"} fi VOICE_CONTAINER="$CONTAINER_NAME:$TAG" VOICE_CONTAINER_BASE="$VOICE_CONTAINER-base" # build the base container echo "CONTAINER=$VOICE_CONTAINER_BASE" echo "BASE_IMAGE=$BASE_IMAGE" sudo docker build -t $VOICE_CONTAINER_BASE -f Dockerfile.$ARCH \ --build-arg BASE_IMAGE=$BASE_IMAGE \ --build-arg NEMO_VERSION=$NEMO_VERSION \ . # build the runtime container echo "CONTAINER=$VOICE_CONTAINER" echo "BASE_IMAGE=$VOICE_CONTAINER_BASE" sudo docker build -t $VOICE_CONTAINER -f Dockerfile.runtime \ --build-arg BASE_IMAGE=$VOICE_CONTAINER_BASE \ . # build ROS version of container if [[ "$ROS_DISTRO" != "none" ]] && [[ $ARCH = "aarch64" ]]; then ROS_CONTAINER="$VOICE_CONTAINER-ros-$ROS_DISTRO" ROS_CONTAINER_BASE="$ROS_CONTAINER-base" # copy files needed to build ROS container if [ ! 
-d "packages/" ]; then cp -r docker/containers/packages packages fi # opencv.csv mounts files that preclude us installing different version of opencv # temporarily disable the opencv.csv mounts while we build the container CV_CSV="/etc/nvidia-container-runtime/host-files-for-container.d/opencv.csv" if [ -f "$CV_CSV" ]; then sudo mv $CV_CSV $CV_CSV.backup fi # build ROS on top of jetson-voice echo "CONTAINER=$ROS_CONTAINER_BASE" echo "BASE_IMAGE=$VOICE_CONTAINER_BASE" sudo docker build -t $ROS_CONTAINER_BASE -f docker/containers/Dockerfile.ros.$ROS_DISTRO \ --build-arg BASE_IMAGE=$VOICE_CONTAINER_BASE \ . # install jetson_voice_ros package echo "CONTAINER=$ROS_CONTAINER" echo "BASE_IMAGE=$ROS_CONTAINER_BASE" sudo docker build -t $ROS_CONTAINER -f Dockerfile.ros \ --build-arg BASE_IMAGE=$ROS_CONTAINER_BASE \ . # restore opencv.csv mounts if [ -f "$CV_CSV.backup" ]; then sudo mv $CV_CSV.backup $CV_CSV fi fi ================================================ FILE: docker/push.sh ================================================ #!/usr/bin/env bash ROS_DISTRO=${1:-"foxy"} source docker/tag.sh # push image push() { local remote_image="dustynv/$1" sudo docker rmi $remote_image sudo docker tag $1 $remote_image echo "pushing image $remote_image" sudo docker push $remote_image echo "done pushing image $remote_image" } push "$CONTAINER_NAME:$TAG" ROS_CONTAINER="$CONTAINER_NAME:$TAG-ros-$ROS_DISTRO" push "$ROS_CONTAINER" ================================================ FILE: docker/run.sh ================================================ #!/usr/bin/env bash # # Start an instance of the jetson-voice docker container. # See below or run this script with -h or --help to see usage options. # # This script should be run from the root dir of the jetson-voice project: # # $ cd /path/to/your/jetson-voice # $ docker/run.sh # show_help() { echo " " echo "usage: Starts the Docker container and runs a user-specified command" echo " " echo " ./docker/run.sh --container DOCKER_IMAGE" echo " --volume HOST_DIR:MOUNT_DIR" echo " --run RUN_COMMAND" echo " " echo "args:" echo " " echo " --help Show this help text and quit" echo " " echo " -c, --container DOCKER_IMAGE Specifies the name of the Docker container" echo " image to use (default: 'jetson-voice')" echo " " echo " --ros ROS_DISTRO Starts the version of the container using the" echo " specified ROS distro (or foxy if not specified)" echo " This is overridden by the --container argument" echo " " echo " -d, --dev Runs the container in development mode, where the source" echo " files are mounted into the container dynamically, so they" echo " can more easily be edited from the host machine." echo " " echo " -v, --volume HOST_DIR:MOUNT_DIR Mount a path from the host system into" echo " the container. Should be specified as:" echo " " echo " -v /my/host/path:/my/container/path" echo " " echo " (these should be absolute paths)" echo " " echo " -r, --run RUN_COMMAND Command to run once the container is started." echo " Note that this argument must be invoked last," echo " as all further arguments will form the command." echo " If no run command is specified, an interactive" echo " terminal into the container will be provided." 
echo " " } die() { printf '%s\n' "$1" show_help exit 1 } # find container tag from os version source docker/tag.sh # where the project resides inside docker DOCKER_ROOT="/jetson-voice" # generate mount commands DATA_VOLUME="--volume $PWD/data:$DOCKER_ROOT/data" DEV_VOLUME="" # parse user arguments USER_VOLUME="" USER_COMMAND="" while :; do case $1 in -h|-\?|--help) show_help # Display a usage synopsis. exit ;; -c|--container) # Takes an option argument; ensure it has been specified. if [ "$2" ]; then CONTAINER_IMAGE=$2 shift else die 'ERROR: "--container" requires a non-empty option argument.' fi ;; --container=?*) CONTAINER_IMAGE=${1#*=} # Delete everything up to "=" and assign the remainder. ;; --container=) # Handle the case of an empty --image= die 'ERROR: "--container" requires a non-empty option argument.' ;; --ros) if [ "$2" ]; then ROS_DISTRO=$2 shift else ROS_DISTRO="foxy" fi ;; --ros=?*) ROS_DISTRO=${1#*=} # Delete everything up to "=" and assign the remainder. ;; --ros=) # Handle the case of an empty --image= ROS_DISTRO="foxy" ;; -d|--dev) DEV_VOLUME="--volume $PWD/jetson_voice:$DOCKER_ROOT/jetson_voice --volume $PWD/examples:$DOCKER_ROOT/examples --volume $PWD/scripts:$DOCKER_ROOT/scripts --volume $PWD/tests:$DOCKER_ROOT/tests" ;; -v|--volume) if [ "$2" ]; then USER_VOLUME=" -v $2 " shift else die 'ERROR: "--volume" requires a non-empty option argument.' fi ;; --volume=?*) USER_VOLUME=" -v ${1#*=} " # Delete everything up to "=" and assign the remainder. ;; --volume=) # Handle the case of an empty --image= die 'ERROR: "--volume" requires a non-empty option argument.' ;; -r|--run) if [ "$2" ]; then shift USER_COMMAND=" $@ " else die 'ERROR: "--run" requires a non-empty option argument.' fi ;; --) # End of all options. shift break ;; -?*) printf 'WARN: Unknown option (ignored): %s\n' "$1" >&2 ;; *) # Default case: No more options, so break out of the loop. 
break esac shift done # select the container, unless --container was explicitly specified if [ -z "$CONTAINER_IMAGE" ]; then CONTAINER_IMAGE="$CONTAINER_NAME:$TAG" if [ -n "$ROS_DISTRO" ]; then CONTAINER_IMAGE="$CONTAINER_NAME:$TAG-ros-$ROS_DISTRO" fi CONTAINER_REMOTE_IMAGE="dustynv/$CONTAINER_IMAGE" # check for local image if [[ "$(sudo docker images -q $CONTAINER_IMAGE 2> /dev/null)" == "" ]]; then CONTAINER_IMAGE=$CONTAINER_REMOTE_IMAGE fi fi echo "CONTAINER: $CONTAINER_IMAGE" echo "DEV_VOLUME: $DEV_VOLUME" echo "DATA_VOLUME: $DATA_VOLUME" echo "USER_VOLUME: $USER_VOLUME" echo "USER_COMMAND: $USER_COMMAND" MOUNTS="\ --device /dev/snd \ --device /dev/bus/usb \ --volume /etc/timezone:/etc/timezone:ro \ --volume /etc/localtime:/etc/localtime:ro \ $DEV_VOLUME \ $DATA_VOLUME \ $USER_VOLUME" if [ $ARCH = "aarch64" ]; then sudo docker run --runtime nvidia -it --rm \ --name=$CONTAINER_NAME \ --network host \ $MOUNTS $CONTAINER_IMAGE $USER_COMMAND elif [ $ARCH = "x86_64" ]; then sudo docker run --gpus all -it --rm \ --name=$CONTAINER_NAME \ --network=host \ --shm-size=8g \ --ulimit memlock=-1 \ --ulimit stack=67108864 \ $MOUNTS $CONTAINER_IMAGE $USER_COMMAND fi ================================================ FILE: docker/tag.sh ================================================ #!/usr/bin/env bash # find OS version source scripts/os_version.sh if [ $ARCH = "aarch64" ]; then TAG="r$L4T_VERSION" if [ $L4T_VERSION = "32.5.1" ] || [ $L4T_VERSION = "32.5.2" ]; then TAG="r32.5.0" fi elif [ $ARCH = "x86_64" ]; then TAG="$ARCH" else echo "unsupported architecture: $ARCH" exit 1 fi CONTAINER_NAME="jetson-voice" ================================================ FILE: examples/asr.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import sys from jetson_voice import ASR, AudioInput, ConfigArgParser, list_audio_devices parser = ConfigArgParser() parser.add_argument('--model', default='quartznet', type=str, help='path to model, service name, or json config file') parser.add_argument('--wav', default=None, type=str, help='path to input wav/ogg/flac file') parser.add_argument('--mic', default=None, type=str, help='device name or number of input microphone') parser.add_argument('--list-devices', action='store_true', help='list audio input devices') args = parser.parse_args() print(args) # list audio devices if args.list_devices: list_audio_devices() sys.exit() # load the model asr = ASR(args.model) # create the audio input stream stream = AudioInput(wav=args.wav, mic=args.mic, sample_rate=asr.sample_rate, chunk_size=asr.chunk_size) # run transcription for samples in stream: results = asr(samples) if asr.classification: print(f"class '{results[0]}' ({results[1]:.3f})") else: for transcript in results: print(transcript['text']) if transcript['end']: print('') print('\naudio stream closed.') ================================================ FILE: examples/assistant.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import sys import pprint from jetson_voice import ( ASR, NLP, TTS, AudioInput, AudioOutput, list_audio_devices, ConfigArgParser ) parser = ConfigArgParser() parser.add_argument('--asr-model', default='quartznet', type=str, help='ASR model') parser.add_argument('--nlp-model', default='distilbert_intent', type=str, help='NLP model') parser.add_argument('--tts-model', default='fastpitch_hifigan', type=str, help='TTS model') parser.add_argument('--wav', default=None, type=str, help='path to input wav/ogg/flac file') 
parser.add_argument('--mic', default=None, type=str, help='device name or number of input microphone') parser.add_argument('--output-device', default=None, type=str, help='device name or number of audio output') parser.add_argument('--list-devices', action='store_true', help='list audio input devices') args = parser.parse_args() print(args) # list audio devices if args.list_devices: list_audio_devices() sys.exit() # load the models tts = TTS(args.tts_model) asr = ASR(args.asr_model, add_punctuation=False) nlp = NLP(args.nlp_model) if asr.classification: raise ValueError(f"'{args.asr_model}' is a classification model - must use a transcription model for agent") if nlp.config.type != 'intent_slot': raise ValueError(f"'{args.nlp_model}' has type '{nlp.config.type}' - the agent requires an intent_slot model") # create the audio streams audio_input = AudioInput(wav=args.wav, mic=args.mic, sample_rate=asr.sample_rate, chunk_size=asr.chunk_size) audio_output = AudioOutput(device=args.output_device, sample_rate=tts.sample_rate) def get_slot(results, name, default='', threshold=0, merge=True): """ Retrieve a slot by name from the intent/slot results. The name can be a list of names, and any of them will be matched. Only slots with a score above the threshold will be returned. If merge is true, all slots by that name will be combined. If merge is false, the first matching slot will be returned. """ if isinstance(name, str): name = [name] slots = [] for slot in results['slots']: if any(slot['slot'] == n for n in name) and slot['score'] >= threshold: slots.append(slot['text']) if len(slots) == 0: return default if len(slots) > 1 and merge: return ' '.join(slots) return slots[0] def generate_response(query): results = nlp(query) pprint.pprint(results) intent = results['intent'] if intent == 'general_praise': return "Why thank you very much!" elif intent == 'weather_query': place = get_slot(results, 'place_name') date = get_slot(results, 'date') response = "The weather " if place: response += 'in ' + place + ' ' if date: response += date + ' ' return response + "is forecast to be sunny with a high of 78 degrees." elif intent == 'recommendation_locations': place = get_slot(results, ['place_name', 'business_name']) if not place: return "Please ask again with the name of a store or restaurant." return f"{place} is located 1 mile away at 1 2 3 Main Street." return "I'm sorry, I don't understand." 
# run agent for input_samples in audio_input: transcripts = asr(input_samples) for transcript in transcripts: print(transcript['text']) if not transcript['end']: continue print('') response = generate_response(transcript['text']) print(response) audio_output.write(tts(response)) """ if transcripts[0] != 'unknown' and transcripts[1] != 'silence': response = generate_response(transcripts[0]) print(response) audio_output.write(tts(response)) """ ================================================ FILE: examples/nlp.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import sys import pprint import readline from jetson_voice import NLP, ConfigArgParser parser = ConfigArgParser() parser.add_argument('--model', default='distilbert_sentiment', type=str) args = parser.parse_args() print(args) # load the model model = NLP(args.model) # QA models should run the nlp_qa.py example type = model.config.type if type == 'qa': raise ValueError("please run Question/Answer models with the nlp_qa.py sample") while True: print(f'\nEnter {type} query, or Q to quit:') query = input('> ') if query.upper() == 'Q': sys.exit() print('') results = model(query) if type == 'intent_slot' or type == 'text_classification': pprint.pprint(results) elif type == 'token_classification': print(f'{model.tag_string(query, results, scores=True)}') ================================================ FILE: examples/nlp_qa.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import sys import readline from jetson_voice import QuestionAnswer, ConfigArgParser parser = ConfigArgParser() parser.add_argument('--model', default='distilbert_qa_384', type=str) parser.add_argument('--top_k', default=1, type=int, help='show the top N answers (default 1)') args = parser.parse_args() print(args) model = QuestionAnswer(args.model) # load the QA model builtin_context = { "Amazon" : "The Amazon rainforest is a moist broadleaf forest that covers most of the Amazon basin of South America. " "This basin encompasses 7,000,000 square kilometres (2,700,000 sq mi), of which 5,500,000 square kilometres " "(2,100,000 sq mi) are covered by the rainforest. The majority of the forest is contained within Brazil, " "with 60% of the rainforest, followed by Peru with 13%, and Colombia with 10%.", "Geology" : "There are three major types of rock: igneous, sedimentary, and metamorphic. Igneous rocks are formed from " "melted rock deep inside the Earth. Sedimentary rocks are compressed layers of sand, silt, dead plants, and " "animal skeletons. Metamorphic rocks are other rocks that are changed by heat and pressure underground.", "Moon Landing" : "The first manned Moon landing was Apollo 11 on July, 20 1969. The first human to step on the Moon was " "astronaut Neil Armstrong followed second by Buzz Aldrin. They landed in the Sea of Tranquility with their " "lunar module the Eagle. They were on the lunar surface for 2.25 hours and collected 50 pounds of moon rocks.", "Pi" : "Some people have said that Pi is tasty but there should be a value for Pi, and the value for Pi is around 3.14. " "Pi is the ratio of a circle's circumference to its diameter. The constant Pi was first calculated by Archimedes " "in ancient Greece around the year 250 BC.", "Super Bowl 55" : "Super Bowl 55 took place on February 7, 2021 in Tampa, Florida between the Kansas City Chiefs and " "the Tampa Bay Buccaneers. The Tampa Bay Buccaneers won by a score of 31 to 9.
In his first season "
                      "with Tampa Bay, it was quarterback Tom Brady's seventh Super Bowl win in nine appearances.",
}

context = builtin_context['Amazon']

def print_context():
    print('\nContext:')
    print(context)

def parse_commands(entry):
    """
    Parse 'C' command for changing context, 'P' to print context, and 'Q' for quit.
    Returns true if a command was entered, otherwise false.
    """
    global context

    if entry == 'C':
        print('\nSelect from one of the following topics, or enter your own context paragraph:')

        for idx, key in enumerate(builtin_context):
            print(f' {idx+1}. {key}')

        entry = input('> ')

        try:
            # try parsing as a number
            num = int(entry)

            if num > 0 and num <= len(builtin_context):
                context = builtin_context[list(builtin_context.keys())[num-1]]
            else:
                print('Invalid entry')
        except:
            # try looking up topic name, otherwise custom paragraph
            if entry in builtin_context:
                context = builtin_context[entry]
            else:
                context = entry

        print_context()
        return True
    elif entry == 'P':
        print_context()
        return True
    elif entry == 'Q':
        sys.exit()

    return False

print_context()

while True:
    print('\nEnter a question, C to change context, P to print context, or Q to quit:')
    entry = input('> ')

    if parse_commands(entry.upper()):
        continue

    query = {
        'context' : context,
        'question' : entry
    }

    results = model(query, top_k=args.top_k)

    if args.top_k == 1:
        results = [results]

    for result in results:
        print('\nAnswer:', result['answer'])
        print('Score: ', result['score'])

================================================
FILE: examples/tts.py
================================================
#!/usr/bin/env python3
# coding: utf-8

import os
import sys
import time
import readline

from jetson_voice import TTS, ConfigArgParser, AudioOutput, list_audio_devices
from soundfile import SoundFile

parser = ConfigArgParser()

parser.add_argument('--model', default='fastpitch_hifigan', type=str)
parser.add_argument('--warmup', default=5, type=int, help='the number of warmup runs')
parser.add_argument("--output-device", default=None, type=str, help='output audio device to use')
parser.add_argument("--output-wav", default=None, type=str, help='output directory or wav file to write to')
parser.add_argument('--list-devices', action='store_true', help='list audio input devices')

args = parser.parse_args()
print(args)

# list audio devices
if args.list_devices:
    list_audio_devices()
    sys.exit()

# load the model
tts = TTS(args.model)

# open output audio device
if args.output_device:
    audio_device = AudioOutput(args.output_device, tts.sample_rate)

# create output wav directory
if args.output_wav:
    wav_is_dir = len(os.path.splitext(args.output_wav)[1]) == 0
    wav_count = 0

    if wav_is_dir and not os.path.exists(args.output_wav):
        os.makedirs(args.output_wav)

while True:
    print(f'\nEnter text, or Q to quit:')
    text = input('> ')

    if text.upper() == 'Q':
        sys.exit()

    print('')

    # run the TTS
    for run in range(args.warmup+1):
        start = time.perf_counter()
        audio = tts(text)
        stop = time.perf_counter()

        latency = stop-start
        duration = audio.shape[0]/tts.sample_rate

        print(f"Run {run} -- Time to first audio: {latency:.3f}s. Generated {duration:.2f}s of audio.
RTFx={duration/latency:.2f}.") # output the audio if args.output_device: audio_device.write(audio) if args.output_wav: wav_path = os.path.join(args.output_wav, f'{wav_count}.wav') if wav_is_dir else args.output_wav wav = SoundFile(wav_path, mode='w', samplerate=tts.sample_rate, channels=1) wav.write(audio) wav.close() wav_count += 1 print(f"\nWrote audio to {wav_path}") ================================================ FILE: jetson_voice/__init__.py ================================================ #!/usr/bin/env python3 # coding: utf-8 from .utils import ( find_resource, list_models, global_config, ConfigDict, ConfigArgParser, list_audio_devices, list_audio_inputs, list_audio_outputs, AudioInput, AudioOutput ) from .asr import ASR, ASRService from .tts import TTS, TTSService from .nlp import (NLP, IntentSlot, IntentSlotService, QuestionAnswer, QuestionAnswerService, TextClassification, TextClassificationService, TokenClassification, TokenClassificationService, ) from .auto import AutoModel __version__ = global_config.version ================================================ FILE: jetson_voice/asr.py ================================================ #!/usr/bin/env python3 # coding: utf-8 from jetson_voice.utils import load_resource def ASR(resource, *args, **kwargs): """ Loads a streaming ASR service or model. See the ASRService class for the signature that implementations use. """ factory_map = { 'riva' : 'jetson_voice.backends.riva.RivaASRService', 'tensorrt' : 'jetson_voice.models.asr.ASREngine', 'onnxruntime' : 'jetson_voice.models.asr.ASREngine' } return load_resource(resource, factory_map, *args, **kwargs) class ASRService(): """ Streaming ASR service base class. """ def __init__(self, config, *args, **kwargs): self.config = config def __call__(self, samples): """ Transcribe streaming audio samples to text, returning the running phrase. Phrases are broken up when a break in the audio is detected (i.e. end of sentence) Parameters: samples (array) -- Numpy array of audio samples. Returns a list[dict] of the running transcripts with the following keys: text (string) -- the transcript of the current sentence words (list[dict]) -- a list of word dicts that make up the sentence end (bool) -- if true, end-of-sentence due to silence Each transcript represents one phrase/sentence. When a sentence has been determined to be ended, it will be marked with end=True. Multiple sentence transcripts can be returned if one just ended and another is beginning. """ pass @property def classification(self): """ Returns true if this is an ASR classification model (e.g. for VAD or keyword spotting) Otherwise, this is an ASR transcription model that converts audio to text. """ return False @property def sample_rate(self): """ The sample rate that the model runs at (in Hz) Input audio should be resampled to this rate. """ pass @property def frame_length(self): """ Duration in seconds per frame / chunk. 
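To make the streaming contract concrete, chunk_size is simply frame_length converted to a sample count. A quick sanity check with representative values (16 kHz audio and the 1.0 s default frame length used by the streaming configs in this repo):

sample_rate  = 16000                            # Hz
frame_length = 1.0                              # seconds of audio consumed per call
chunk_size   = int(frame_length * sample_rate)  # -> 16000 samples passed to each __call__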
""" pass @property def chunk_size(self): """ Number of samples per frame/chunk (equal to frame_length * sample_rate) """ pass if __name__ == "__main__": from jetson_voice import list_audio_devices, AudioInput, ConfigArgParser import sys parser = ConfigArgParser() parser.add_argument('--model', default='quartznet', type=str, help='path to model, service name, or json config file') parser.add_argument('--wav', default=None, type=str, help='path to input wav file') parser.add_argument('--mic', default=None, type=str, help='device name or number of input microphone') parser.add_argument('--list-devices', action='store_true', help='list audio input devices') args = parser.parse_args() print(args) # list audio devices if args.list_devices: list_audio_devices() sys.exit() # load the model asr = ASR(args.model) # create the audio input stream stream = AudioInput(wav=args.wav, mic=args.mic, sample_rate=asr.sample_rate, chunk_size=asr.chunk_size) # run transcription for samples in stream: #samples = audio_to_float(samples) #print(f'samples {samples.shape} ({audio_db(samples):.1f} dB)') results = asr(samples) if asr.classification: print(f"class '{results[0]}' ({results[1]:.3f})") else: for transcript in results: print(transcript['text']) if transcript['end']: print('') print('\naudio stream closed.') ================================================ FILE: jetson_voice/auto.py ================================================ #!/usr/bin/env python3 # coding: utf-8 from jetson_voice.asr import ASR from jetson_voice.nlp import IntentSlot, QuestionAnswer, TextClassification, TokenClassification from jetson_voice.tts import TTS from jetson_voice.utils import load_resource def AutoModel(resource, domain=None, *args, **kwargs): """ Factory for automatically loading models and services. First the config is loaded and the type is checked. Then the correct instance for the resource is created. If a domain string is supplied (e.g. 'asr', 'nlp', 'tts'), then only resources from that domain will be created. 
""" type_map = { # models 'asr' : (ASR, 'asr'), 'asr_classification' : (ASR, 'asr'), 'intent_slot' : (IntentSlot, 'nlp'), 'qa' : (QuestionAnswer, 'nlp'), 'text_classification' : (TextClassification, 'nlp'), 'token_classification' : (TokenClassification, 'nlp'), 'tts': (TTS, 'tts'), # services 'jarvis_asr' : (ASR, 'asr') } config = load_resource(resource, None, *args, **kwargs) if 'type' not in config: raise ValueError(f"'type' setting missing from config '{config.path}'") if config.type not in type_map: raise ValueError(f"'{config.path}' has invalid 'type' ({config.type})") if domain: if type_map[config.type][1] != domain.lower(): raise ValueError(f"invalid model selected - '{config.path}' has domain '{type_map[config.type][1]}', but AutoModel() was called with domain={domain}") return type_map[config.type][0](config, *args, **kwargs) ================================================ FILE: jetson_voice/backends/onnxruntime/__init__.py ================================================ #!/usr/bin/env python3 # coding: utf-8 from .ort_model import OnnxRuntimeModel ================================================ FILE: jetson_voice/backends/onnxruntime/ort_model.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import logging # for some reason if PyCUDA isn't initialized before OnnxRuntime # and TensorRT is also used, it makes TensorRT error import pycuda.driver as cuda import pycuda.autoinit import numpy as np import onnxruntime as ort class OnnxRuntimeModel: """ Base class for OnnxRuntime models. """ def __init__(self, config, *args, **kwargs): """ Load an ONNX Runtime model. """ self.config = config logging.info(f"loading ONNX model '{self.config.model_path}' with onnxruntime") self.model = ort.InferenceSession(config.model_path, providers=['CUDAExecutionProvider']) logging.info(f"loaded ONNX model '{self.config.model_path}' with onnxruntime") self.inputs = self.model.get_inputs() self.outputs = self.model.get_outputs() for idx, binding in enumerate(self.inputs): print('') print(f"input {idx} - {binding.name}") print(f" shape: {binding.shape}") print(f" type: {binding.type}") print('') def execute(self, inputs, return_dict=False, **kwargs): """ Run the DNN model in TensorRT. The inputs are provided as numpy arrays in a list/tuple/dict. Note that run() doesn't perform any pre/post-processing - this is typically done in subclasses. Parameters: inputs (array, list[array], dict[array]) -- the network inputs as numpy array(s). If there is only one input, it can be provided as a single numpy array. If there are multiple inputs, they can be provided as numpy arrays in a list, tuple, or dict. Inputs in lists and tuples are assumed to be in the same order as the input bindings. Inputs in dicts should have keys with the same names as the input bindings. return_dict (bool) -- If True, the results will be returned in a dict of numpy arrays, where the keys are the names of the output binding names. By default, the results will be returned in a list of numpy arrays, in the same order as the output bindings. Returns the model output as a numpy array (if only one output), list[ndarray], or dict[ndarray]. 
""" if isinstance(inputs, np.ndarray): inputs = [inputs] assert len(inputs) == len(self.inputs) if isinstance(inputs, (list,tuple)): inputs = {self.inputs[i].name : input for i, input in enumerate(inputs)} elif not isinstance(inputs, dict): raise ValueError(f"inputs must be a list, tuple, or dict (instead got type '{type(inputs).__name__}')") outputs = self.model.run(None, inputs) if return_dict: return {self.outputs[i].name : output for i, output in enumerate(outputs)} if len(outputs) == 1: return outputs[0] return outputs ================================================ FILE: jetson_voice/backends/riva/__init__.py ================================================ #!/usr/bin/env python3 # coding: utf-8 from .riva_asr import RivaASRService from .riva_tts import RivaTTSService ================================================ FILE: jetson_voice/backends/riva/riva_asr.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import grpc import queue import threading import logging import riva_api.audio_pb2 as ra import riva_api.riva_asr_pb2 as rasr import riva_api.riva_asr_pb2_grpc as rasr_srv from jetson_voice import ASRService from jetson_voice.utils import audio_to_int16 class RivaASRService(ASRService): """ Riva streaming ASR service. """ def __init__(self, config, *args, **kwargs): """ Open a streaming channel to the Riva server for ASR. This establishes a connection over GRPC and sends/recieves the requests and responses asynchronously. Incoming audio samples get put into a request queue that GRPC picks up, and a thread waits on responses to come in. """ super(RivaASRService, self).__init__(config, *args, **kwargs) self.config.setdefault('server', 'localhost:50051') self.config.setdefault('sample_rate', 16000) self.config.setdefault('frame_length', 1.0) self.config.setdefault('request_timeout', 2.0) # how long to wait for new audio to come in self.config.setdefault('response_timeout', 0.05) # how long to wait for results from riva self.config.setdefault('language_code', 'en-US') self.config.setdefault('enable_automatic_punctuation', True) self.config.setdefault('top_k', 1) logging.info(f'Riva ASR service config:\n{self.config}') self.channel = grpc.insecure_channel(self.config.server) self.client = rasr_srv.RivaSpeechRecognitionStub(self.channel) self.recognition_config = rasr.RecognitionConfig( encoding = ra.AudioEncoding.LINEAR_PCM, sample_rate_hertz = self.config.sample_rate, language_code = self.config.language_code, max_alternatives = self.config.top_k, enable_word_time_offsets = True, enable_automatic_punctuation = self.config.enable_automatic_punctuation ) self.streaming_config = rasr.StreamingRecognitionConfig( config = self.recognition_config, interim_results = True ) self.request_queue = queue.Queue() self.request_queue.put(rasr.StreamingRecognizeRequest(streaming_config=self.streaming_config)) self.responses = self.client.StreamingRecognize(self) self.responses_queue = queue.Queue() self.response_thread = threading.Thread(target=self.recieve_responses) self.response_thread.start() def __call__(self, samples): """ Transcribe streaming audio samples to text, returning the running phrase. Phrases are broken up when a break in the audio is detected (i.e. end of sentence) Parameters: samples (array) -- Numpy array of audio samples. 
Returns a list[dict] of the running transcripts with the following keys: text (string) -- the transcript of the current sentence words (list[dict]) -- a list of word dicts that make up the sentence end (bool) -- if true, end-of-sentence due to silence Each transcript represents one phrase/sentence. When a sentence has been determined to be ended, it will be marked with end=True. Multiple sentence transcripts can be returned if one just ended and another is beginning. """ samples = audio_to_int16(samples) self.request_queue.put(rasr.StreamingRecognizeRequest(audio_content=samples.tobytes())) transcripts = [] while True: try: transcripts.append(self.responses_queue.get(block=True, timeout=self.config.response_timeout)) except queue.Empty: break return transcripts def __next__(self): """ Retrieve the next request containing audio samples to send to the Riva server. This is implemented using an iterator interface as that is what GRPC expects. """ try: request = self.request_queue.get(block=True, timeout=self.config.request_timeout) return request except queue.Empty: logging.debug(f'{self.config.request_timeout} second timeout occurred waiting for audio samples, stopping Riva ASR service') raise StopIteration def recieve_responses(self): """ Wait to recieve responses from the Riva server and parse them. """ logging.debug('starting Riva ASR service response reciever thread') for response in self.responses: # this is blocking if not response.results: continue result = response.results[0] if not result.alternatives: continue text = result.alternatives[0].transcript text = text.strip() if len(text) == 0: continue self.responses_queue.put({ 'text' : text, 'end' : result.is_final }) logging.debug('exiting Riva ASR service response reciever thread') @property def sample_rate(self): """ The sample rate that the model runs at (in Hz) Input audio should be resampled to this rate. """ return self.config.sample_rate @property def frame_length(self): """ Duration in seconds per frame / chunk. """ return self.config.frame_length @property def chunk_size(self): """ Number of samples per frame/chunk (equal to frame_length * sample_rate) """ return int(self.frame_length * self.sample_rate) ================================================ FILE: jetson_voice/backends/riva/riva_tts.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import grpc import logging import numpy as np import riva_api.audio_pb2 as ra import riva_api.riva_tts_pb2 as rtts import riva_api.riva_tts_pb2_grpc as rtts_srv from jetson_voice import TTSService class RivaTTSService(TTSService): """ Riva streaming TTS service. """ def __init__(self, config, *args, **kwargs): """ Open a streaming channel to the Riva server for TTS. This establishes a connection over GRPC and sends/recieves the requests and responses. """ super(RivaTTSService, self).__init__(config, *args, **kwargs) self.config.setdefault('server', 'localhost:50051') self.config.setdefault('sample_rate', 22050) # ignored (will always be 22.05KHz) self.config.setdefault('voice_name', 'ljspeech') # ignored self.config.setdefault('language_code', 'en-US') logging.info(f'Riva TTS service config:\n{self.config}') self.channel = grpc.insecure_channel(self.config.server) self.client = rtts_srv.RivaSpeechSynthesisStub(self.channel) def __call__(self, text): """ Generate audio from text. Parameters: text (string) -- The phrase to convert to audio. Returns audio samples in a numpy array. 
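A minimal usage sketch, assuming a Riva server on the default localhost:50051 and playback through the AudioOutput helper; the resource name here is hypothetical (any config whose backend resolves to this service would do):

from jetson_voice import TTS, AudioOutput

tts = TTS('riva_tts')                              # hypothetical resource name for this backend
audio_output = AudioOutput(device=None, sample_rate=tts.sample_rate)

samples = tts('Hello from the Riva TTS service.')  # float32 numpy array at 22050 Hz
audio_output.write(samples)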
""" req = rtts.SynthesizeSpeechRequest() req.text = text req.language_code = self.config.language_code req.sample_rate_hz = self.config.sample_rate req.voice_name = self.config.voice_name req.encoding = ra.AudioEncoding.LINEAR_PCM resp = self.client.Synthesize(req) samples = np.frombuffer(resp.audio, dtype=np.float32) return samples @property def sample_rate(self): """ Get the output sample rate (in Hz) """ return self.config.sample_rate ================================================ FILE: jetson_voice/backends/tensorrt/__init__.py ================================================ #!/usr/bin/env python3 # coding: utf-8 from .trt_model import TRTModel ================================================ FILE: jetson_voice/backends/tensorrt/trt_binding.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import logging import tensorrt as trt import pycuda.driver as cuda import pycuda.autoinit class Binding: """ Represents an input/output tensor to the model. """ def __init__(self, model, index): """ Parameters: model (TRTModel) -- parent model instance index (int) -- index of the binding in the model """ self.model = model self.index = index self.name = model.trt_engine.get_binding_name(index) self.shape = tuple(model.trt_engine.get_binding_shape(index)) self.dtype = model.trt_engine.get_binding_dtype(index) self.input = model.trt_engine.binding_is_input(index) self.size = max(trt.volume(self.shape) * self.dtype.itemsize, 0) self.dynamic = (self.size <= 0) self.profiles = [] if self.input: for i in range(model.trt_engine.num_optimization_profiles): profile = model.trt_engine.get_profile_shape(i, index) self.profiles.append(dict( min = profile[0], opt = profile[1], max = profile[2])) self.alloc() def alloc(self, shape=None): """ Allocate memory for the binding. alloc() is called automatically when needed. If new shape is provided, it will update the internal state. """ if shape is not None: self.shape = shape self.size = trt.volume(self.shape) * self.dtype.itemsize if self.size <= 0: # dynamic with shape not yet set self.host = None self.device = None return self.host = None if self.input else cuda.pagelocked_empty(self.shape, dtype=trt.nptype(self.dtype)) self.device = cuda.mem_alloc(self.size) def set_shape(self, shape): """ Set the shape of a dynamic input binding. """ if not self.dynamic: raise ValueError(f"binding '{self.name}' is not dynamic") if not self.input: raise ValueError(f"binding '{self.name}' is not an input") # check to see if the shape already matches if self.shape == shape: logging.debug(f"binding '{self.name}' already has shape {shape}") return logging.debug(f"binding '{self.name}' has new shape {shape}") # set the new shape if not self.model.trt_context.set_binding_shape(self.index, shape): raise ValueError(f"failed to set binding '{self.name}' with shape {shape}") # re-allocate tensor memory self.alloc(shape) def query_shape(self): """ Updates the shape of a dynamic output binding. 
""" if not self.dynamic: return if self.input: raise ValueError(f"binding '{self.name}' is not an output") # get the new shape shape = tuple(self.model.trt_context.get_binding_shape(self.index)) # check to see if the shape already matches if self.shape == shape: logging.debug(f"binding '{self.name}' already has shape {shape}") return logging.debug(f"binding '{self.name}' has new output shape {shape}") # re-allocate tensor memory self.alloc(shape) return shape def __str__(self): return ( f"binding {self.index} - '{self.name}'\n" f" input: {self.input}\n" f" shape: {self.shape}\n" f" dtype: {self.dtype}\n" f" size: {self.size}\n" f" dynamic: {self.dynamic}\n" f" profiles: {self.profiles}\n" ) ================================================ FILE: jetson_voice/backends/tensorrt/trt_builder.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import time import json import logging import tensorrt as trt import pycuda.driver as cuda import pycuda.autoinit TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE) def build_engine(config, output=None, precision='fp16', batch_size=1, dynamic_shapes=None, workspace=128, parse_only=False): """ Build TensorRT engine from ONNX model. Parameters: model (string) -- path to ONNX model config (string) -- path to model configuration json (will be inferred from model path if empty) output (string) -- path to output serialized TensorRT engine (will be inferred from model path if empty) precision (string) -- fp32 or fp16 (int8 not currently supported) batch_size (int) -- the maximum batch size (default 1) dynamic_shape (dict) -- dynamic shape profiles for min/max/opt workspace (int) -- builder workspace memory size (in MB) parse_only (bool) -- if true, test parsing the model before exiting without building the TensorRT engine Returns the built TensorRT engine (ICudaEngine) """ # set default output path if output is None or output == '': output = f'{os.path.splitext(config.model_path)[0]}.engine' # create TensorRT resources builder = trt.Builder(TRT_LOGGER) builder_config = builder.create_builder_config() network = builder.create_network(1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) parser = trt.OnnxParser(network, TRT_LOGGER) builder_config.max_workspace_size = workspace * 1 << 20 # set precision precision = precision.lower() if precision == 'fp16': builder_config.set_flag(trt.BuilderFlag.FP16) logging.info(f'enabled FP16 precision') elif precision == 'int8': # https://github.com/NVIDIA/TensorRT/blob/d7baf010e4396c87d58e4d8a33052c01c2d89325/demo/BERT/builder.py#L592 raise NotImplementedError('INT8 support not yet implemented') # load the model (from ONNX) logging.info(f'loading {config.model_path}') with open(config.model_path, 'rb') as model_file: if not parser.parse(model_file.read()): logging.error(f'failed to parse ONNX model {config.model_path}') for error in range(parser.num_errors): print (parser.get_error(error)) return None # create dynamic shape profile # TODO refactor this to an abstract .get_dynamic_shapes() implementation in each subclass # TODO this currently uses same shape for all inputs - allow for different shape profiles profile = builder.create_optimization_profile() opt_shape = None """ if model_type == 'qa' or model_type == 'text_classification' or model_type == 'token_classification': min_shape = (1, 1) # (batch_size, sequence_length) max_shape = (batch_size, model_config['dataset']['max_seq_length']) elif model_type == 'intent_slot': min_shape = (1, 1) # (batch_size, sequence_length) 
max_shape = (batch_size, model_config['language_model']['max_seq_length']) elif model_type == 'asr': features = model_config['preprocessor']['features'] sample_rate = model_config['preprocessor']['sample_rate'] sample_to_fft = 1.0 / 160.0 # rough conversion from samples to MEL spectrogram dims sample_multiplier = sample_rate * sample_to_fft min_shape = (batch_size, features, int(0.5 * sample_multiplier)) # minimum plausible frame length opt_shape = (batch_size, features, int(1.2 * sample_multiplier)) # default of .1s overlap factor (1,64,121) max_shape = (batch_size, features, int(3.0 * sample_multiplier)) # enough for 1s overlap factor elif model_type == 'asr_classification': features = model_config['preprocessor']['n_mels'] sample_rate = model_config['sample_rate'] sample_to_fft = 1.0 / 160.0 # rough conversion from samples to MEL spectrogram dims sample_multiplier = sample_rate * sample_to_fft min_shape = (batch_size, features, int(0.5 * sample_multiplier)) # minimum plausible frame length opt_shape = (batch_size, features, int(1.2 * sample_multiplier)) # default of .1s overlap factor (1,64,121) max_shape = (batch_size, features, int(3.0 * sample_multiplier)) # enough for 1s overlap factor elif model_type == 'tts_vocoder': min_shape = (batch_size, model_config['features'], 1) opt_shape = (batch_size, model_config['features'], 160) # ~5-6 words max_shape = (batch_size, model_config['features'], 512) # ~15-20 words? else: raise NotImplementedError(f"model type '{model_type}' is unrecognized or not supported") """ # TODO support different shape profiles for different input tensors if dynamic_shapes is not None: if 'min' not in dynamic_shapes: dynamic_shapes['min'] = dynamic_shapes['max'] if 'opt' not in dynamic_shapes: dynamic_shapes['opt'] = dynamic_shapes['max'] for i in range(network.num_inputs): # TODO confirm that input is in fact dynamic profile.set_shape(network.get_input(i).name, min=dynamic_shapes['min'], opt=dynamic_shapes['opt'], max=dynamic_shapes['max']) builder_config.add_optimization_profile(profile) def print_summary(): print('') print('----------------------------------------------------') print(' BUILDER CONFIGURATION') print('----------------------------------------------------') print(f' - model {config.model_path}') print(f' - config {config.path}') print(f' - output {output}') print(f' - type {config.type}') print(f' - layers {network.num_layers}') print(f' - inputs {network.num_inputs}') print(f' - outputs {network.num_outputs}') print(f' - precision {precision}') print(f' - workspace {workspace}') print('') for i in range(network.num_inputs): tensor = network.get_input(i) print(f' - input {i}:') print(f' - name {tensor.name}') print(f' - shape {tensor.shape}') print(f' - dtype {tensor.dtype}') for i in range(network.num_outputs): tensor = network.get_output(i) print(f' - output {i}:') print(f' - name {tensor.name}') print(f' - shape {tensor.shape}') print(f' - dtype {tensor.dtype}') print_summary() if parse_only: return None # build the engine build_start_time = time.time() engine = builder.build_engine(network, builder_config) if engine is None: raise ValueError(f"failed to build TensorRT engine for '{config.model_path}'") build_time_elapsed = (time.time() - build_start_time) print(f'\nbuilt engine in {build_time_elapsed} seconds') print_summary() # save engine print('\nserializing engine...') serialized_engine = engine.serialize() with open(output, "wb") as engine_file: engine_file.write(serialized_engine) print(f'saved engine to {output}') return engine ''' if 
__name__ == "__main__": import argparse parser = argparse.ArgumentParser() parser.add_argument('--config', default='', type=str) parser.add_argument('--output', default='', type=str) parser.add_argument('--precision', default='fp16', choices=['fp32', 'fp16', 'int8'], type=str) parser.add_argument('--batch-size', default=1, type=int) # max batch size parser.add_argument('--workspace', default=utils.DEFAULT_WORKSPACE, type=int) parser.add_argument('--parse-only', action='store_true') args = parser.parse_args() print(args) build_engine(config=args.config, output=args.output, precision=args.precision, batch_size=args.batch_size, workspace=args.workspace, parse_only=args.parse_only) ''' ================================================ FILE: jetson_voice/backends/tensorrt/trt_model.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import time import json import logging import pprint import numpy as np import tensorrt as trt import pycuda.driver as cuda import pycuda.autoinit from .trt_builder import build_engine, TRT_LOGGER from .trt_binding import Binding class TRTModel: """ Base class for TensorRT models. """ def __init__(self, config, dynamic_shapes=None, *args, **kwargs): """ Load a TensorRT model from ONNX or serialized TensorRT engine. Parameters: config (ConfigDict) -- configuration dict dynamic_shapes (dict) -- dynamic shape profiles for min/max/opt """ self.config = config # determine if the TensorRT engine already exists model_root, model_ext = os.path.splitext(self.config.model_path) model_ext = model_ext.lower() if model_ext == '.onnx': engine_path = model_root + '.engine' if os.path.exists(engine_path): logging.info(f'loading cached TensorRT engine from {engine_path}') self.config.model_path = engine_path model_ext = '.engine' # either build or load TensorRT engine if model_ext == '.onnx': self.trt_engine = build_engine(self.config, dynamic_shapes=dynamic_shapes) elif model_ext == '.engine' or model_ext == '.plan': with open(self.config.model_path, 'rb') as f: self.trt_runtime = trt.Runtime(TRT_LOGGER) self.trt_engine = self.trt_runtime.deserialize_cuda_engine(f.read()) else: raise ValueError(f"invalid model extension '{model_ext}' (should be .onnx, .engine, or .plan)") if self.trt_engine is None: raise IOError(f'failed to load TensorRT engine from {self.model_path}') self.trt_context = self.trt_engine.create_execution_context() logging.info(f'loaded TensorRT engine from {self.config.model_path}') # create a stream in which to copy inputs/outputs and run inference self.stream = cuda.Stream() # enumerate bindings self.bindings = [] self.inputs = [] self.outputs = [] for i in range(len(self.trt_engine)): binding = Binding(self, i) self.bindings.append(binding) if binding.input: self.inputs.append(binding) else: self.outputs.append(binding) for binding in self.bindings: print(f'\n{binding}') def execute(self, inputs, sync=True, return_dict=False, **kwargs): """ Run the DNN model in TensorRT. The inputs are provided as numpy arrays in a list/tuple/dict. Note that run() doesn't perform any pre/post-processing - this is typically done in subclasses. Parameters: inputs (array, list[array], dict[array]) -- the network inputs as numpy array(s). If there is only one input, it can be provided as a single numpy array. If there are multiple inputs, they can be provided as numpy arrays in a list, tuple, or dict. Inputs in lists and tuples are assumed to be in the same order as the input bindings. 
Inputs in dicts should have keys with the same names as the input bindings. sync (bool) -- If True (default), will wait for the GPU to be done processing before returning. return_dict (bool) -- If True, the results will be returned in a dict of numpy arrays, where the keys are the names of the output binding names. By default, the results will be returned in a list of numpy arrays, in the same order as the output bindings. Returns the model output as a numpy array (if only one output), list[ndarray], or dict[ndarray]. """ if isinstance(inputs, np.ndarray): inputs = [inputs] assert len(inputs) == len(self.inputs) # setup inputs + copy to GPU def setup_binding(binding, input): input = input.astype(trt.nptype(binding.dtype), copy=False) if binding.dynamic: binding.set_shape(input.shape) cuda.memcpy_htod_async(binding.device, np.ascontiguousarray(input), self.stream) if isinstance(inputs, (list,tuple)): for idx, input in enumerate(inputs): setup_binding(self.bindings[idx], input) elif isinstance(inputs, dict): for binding_name in inputs: setup_binding(self.find_binding(binding_name), inputs[binding_name]) else: raise ValueError(f"inputs must be a list, tuple, or dict (instead got type '{type(inputs).__name__}')") assert self.trt_context.all_binding_shapes_specified assert self.trt_context.all_shape_inputs_specified # query new dynamic output shapes for output in self.outputs: output.query_shape() # run inference self.trt_context.execute_async_v2( bindings=[int(binding.device) for binding in self.bindings], stream_handle=self.stream.handle ) # copy outputs to CPU for output in self.outputs: cuda.memcpy_dtoh_async(output.host, output.device, self.stream) # wait for completion if sync: self.stream.synchronize() # return results if return_dict: results = {} for output in self.outputs: results[output.name] = output.host return results else: if len(self.outputs) == 1: return self.outputs[0].host else: return tuple([output.host for output in self.outputs]) def find_binding(self, name): """ Lookup an input/output binding by name """ for binding in self.bindings: if binding.name == name: return binding logging.error(f"couldn't find binding with name '{name}'") return None def set_shape(self, binding, shape): """ Set the shape of a dynamic binding. 
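Per the type checks below, the binding can be specified by index or by name; a hedged sketch with an illustrative binding name and shape, given a loaded TRTModel 'model':

model.set_shape(0, (1, 64, 121))               # by binding index
model.set_shape('audio_signal', (1, 64, 121))  # by binding name (hypothetical)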
""" if isinstance(binding, int): binding = self.bindings[binding] elif isinstance(binding, str): binding = self.find_binding(binding) elif not isinstance(binding, dict): raise ValueError(f'binding must be specified as int, string, or dict (got {type(binding).__name__})') binding.set_shape(shape) ================================================ FILE: jetson_voice/models/__init__.py ================================================ #!/usr/bin/env python3 # coding: utf-8 from .asr import ASREngine from .nlp import IntentSlotEngine, QuestionAnswerEngine, TextClassificationEngine, TokenClassificationEngine from .tts import TTSEngine ================================================ FILE: jetson_voice/models/asr/__init__.py ================================================ #!/usr/bin/env python3 # coding: utf-8 from .asr_engine import ASREngine ================================================ FILE: jetson_voice/models/asr/asr_engine.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import time import pprint import logging import importlib import torch import numpy as np from .ctc_decoder import CTCDecoder from jetson_voice.asr import ASRService from jetson_voice.utils import audio_to_float, global_config, load_model, softmax class ASREngine(ASRService): """ Streaming ASR (Automatic Speech Recognition) model in TensorRT or onnxruntime. This model is primarily designed to be used on a live audio source like a microphone. """ def __init__(self, config, *args, **kwargs): """ Loads a streaming ASR model from ONNX or serialized TensorRT engine. Parameters: model (string) -- path to ONNX model or serialized TensorRT engine/plan config (string) -- path to model configuration json (will be inferred from model path if empty) """ super(ASREngine, self).__init__(config, *args, **kwargs) if self.config.type != 'asr' and self.config.type != 'asr_classification': raise ValueError(f"{self.config.model_path} isn't an ASR model (type '{self.config.type}'") # set some default config options that are non-standard in nemo if 'streaming' not in self.config: self.config['streaming'] = {} self.config['streaming'].setdefault('frame_length', 1.0) # duration of signal frame, seconds (TODO shorter defaults for VAD/command classifiers) self.config['streaming'].setdefault('frame_overlap', 0.5) # duration of overlap before/after current frame, seconds # some config changes for streaming if not self.classification: self.config['preprocessor']['dither'] = 0.0 self.config['preprocessor']['pad_to'] = 0 if 'ctc_decoder' not in self.config: self.config['ctc_decoder'] = {} self.config['ctc_decoder'].setdefault('type', 'greedy') # greedy or beamsearch self.config['ctc_decoder'].setdefault('add_punctuation', True) # add period to the end of sentences if 'add_punctuation' in kwargs: self.config['ctc_decoder']['add_punctuation'] = kwargs['add_punctuation'] logging.info(f"add_punctuation = {kwargs['add_punctuation']}") if not self.classification and self.config['preprocessor']['features'] == 64: # TODO normalization coefficients for citrinet (N=80) normalization = {} normalization['fixed_mean'] = [ -14.95827016, -12.71798736, -11.76067913, -10.83311182, -10.6746914, -10.15163465, -10.05378331, -9.53918999, -9.41858904, -9.23382904, -9.46470918, -9.56037, -9.57434245, -9.47498732, -9.7635205, -10.08113074, -10.05454561, -9.81112681, -9.68673603, -9.83652977, -9.90046248, -9.85404766, -9.92560366, -9.95440354, -10.17162966, -9.90102482, -9.47471025, -9.54416855, -10.07109475, -9.98249912, 
-9.74359465, -9.55632283, -9.23399915, -9.36487649, -9.81791084, -9.56799225, -9.70630899, -9.85148006, -9.8594418, -10.01378735, -9.98505315, -9.62016094, -10.342285, -10.41070709, -10.10687659, -10.14536695, -10.30828702, -10.23542833, -10.88546868, -11.31723646, -11.46087382, -11.54877829, -11.62400934, -11.92190509, -12.14063815, -11.65130117, -11.58308531, -12.22214663, -12.42927197, -12.58039805, -13.10098969, -13.14345864, -13.31835645, -14.47345634] normalization['fixed_std'] = [ 3.81402054, 4.12647781, 4.05007065, 3.87790987, 3.74721178, 3.68377423, 3.69344, 3.54001005, 3.59530412, 3.63752368, 3.62826417, 3.56488469, 3.53740577, 3.68313898, 3.67138151, 3.55707266, 3.54919572, 3.55721289, 3.56723346, 3.46029304, 3.44119672, 3.49030548, 3.39328435, 3.28244406, 3.28001423, 3.26744937, 3.46692348, 3.35378948, 2.96330901, 2.97663111, 3.04575148, 2.89717604, 2.95659301, 2.90181116, 2.7111687, 2.93041291, 2.86647897, 2.73473181, 2.71495654, 2.75543763, 2.79174615, 2.96076456, 2.57376336, 2.68789782, 2.90930817, 2.90412004, 2.76187531, 2.89905006, 2.65896173, 2.81032176, 2.87769857, 2.84665271, 2.80863137, 2.80707634, 2.83752184, 3.01914511, 2.92046439, 2.78461139, 2.90034605, 2.94599508, 2.99099718, 3.0167554, 3.04649716, 2.94116777] self.config['preprocessor']['normalize'] = normalization # create preprocessor instance preprocessor_name = self.config['preprocessor']['_target_'].rsplit(".", 1) preprocessor_class = getattr(importlib.import_module(preprocessor_name[0]), preprocessor_name[1]) logging.debug(f'ASR preprocessor - {preprocessor_class}') preprocessor_config = self.config['preprocessor'].copy() preprocessor_config.pop('_target_') self.preprocessor = preprocessor_class(**preprocessor_config) # load the model features = self.config.preprocessor.n_mels if self.classification else self.config.preprocessor.features time_to_fft = self.sample_rate * (1.0 / 160.0) # rough conversion from samples to MEL spectrogram dims dynamic_shapes = { 'min' : (1, features, int(0.1 * time_to_fft)), # minimum plausible frame length 'opt' : (1, features, int(1.5 * time_to_fft)), # default of .5s overlap factor (1,64,121) 'max' : (1, features, int(3.0 * time_to_fft)) # enough for 2s overlap factor } self.model = load_model(self.config, dynamic_shapes) # create CTC decoder if not self.classification: self.ctc_decoder = CTCDecoder.from_config(self.config['ctc_decoder'], self.config['decoder']['vocabulary'], os.path.dirname(self.config.model_path)) logging.info(f"CTC decoder type: '{self.ctc_decoder.type}'") # create streaming buffer self.n_frame_len = int(self.frame_length * self.sample_rate) self.n_frame_overlap = int(self.frame_overlap * self.sample_rate) self.buffer_length = self.n_frame_len + self.n_frame_overlap self.buffer_duration = self.buffer_length / self.sample_rate self.buffer = np.zeros(shape=self.buffer_length, dtype=np.float32) # 2*self.n_frame_overlap def __call__(self, samples): """ Transcribe streaming audio samples to text, returning the running phrase. Phrases are broken up when a break in the audio is detected (i.e. end of sentence) Parameters: samples (array) -- Numpy array of audio samples. Returns a dict of the running phrase. 
transcript (string) -- the current transcript latest (string) -- the latest additions to the transcript end (bool) -- if true, end-of-sequence due to silence """ samples = audio_to_float(samples) if len(samples) < self.n_frame_len: samples = np.pad(samples, [0, self.n_frame_len - len(samples)], 'constant') self.buffer[:self.n_frame_overlap] = self.buffer[-self.n_frame_overlap:] self.buffer[self.n_frame_overlap:] = samples if global_config.profile: preprocess_begin = time.perf_counter() # apply pre-processing preprocessed_signal, _ = self.preprocessor( input_signal=torch.as_tensor(self.buffer, dtype=torch.float32).unsqueeze(dim=0), length=torch.as_tensor(self.buffer.size, dtype=torch.int64).unsqueeze(dim=0) ) if global_config.profile: logging.info(f'preprocess time: {time.perf_counter() - preprocess_begin}') network_begin = time.perf_counter() # run the asr model logits = self.model.execute(torch_to_numpy(preprocessed_signal)) logits = np.squeeze(logits) logits = softmax(logits, axis=-1) if global_config.profile: logging.info(f'network time: {time.perf_counter() - network_begin}') self.timestep_duration = self.buffer_duration / logits.shape[0] self.n_timesteps_frame = int(self.frame_length / self.timestep_duration) self.n_timesteps_overlap = int(self.frame_overlap / self.timestep_duration) if self.classification: argmax = np.argmax(logits) prob = logits[argmax] return (self.config['labels'][argmax], prob) else: self.ctc_decoder.set_timestep_duration(self.timestep_duration) self.ctc_decoder.set_timestep_delta(self.n_timesteps_frame) if global_config.profile: ctc_decoder_begin = time.perf_counter() transcripts = self.ctc_decoder.decode(logits) if global_config.profile: logging.info(f'ctc_decoder time: {time.perf_counter() - ctc_decoder_begin}') return transcripts @property def classification(self): """ Returns true if this is an ASR classification model. """ return self.config.type == 'asr_classification' @property def sample_rate(self): """ The sample rate that the model runs at. Input audio should be resampled to this rate. """ return self.config['sample_rate'] if self.classification else self.config['preprocessor']['sample_rate'] @property def frame_length(self): """ Duration in seconds per frame / chunk. """ return self.config['streaming']['frame_length'] @property def frame_overlap(self): """ Duration of overlap in seconds before/after current frame. """ return self.config['streaming']['frame_overlap'] @property def chunk_size(self): """ Number of samples per frame/chunk (equal to frame_length * sample_rate) """ return self.n_frame_len def torch_to_numpy(tensor): return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy() ================================================ FILE: jetson_voice/models/asr/ctc_beamsearch.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import logging from .ctc_decoder import CTCDecoder from .ctc_utils import find_silent_intervals, merge_words, rebase_word_times, split_words, transcript_from_words from ctc_decoders import Scorer from swig_decoders import BeamDecoder, ctc_beam_search_decoder_ex from jetson_voice.utils import global_config class CTCBeamSearchDecoder(CTCDecoder): """ CTC beam search decoder that optionally uses a language model. """ def __init__(self, config, vocab, resource_path=None): """ Create a new CTCBeamSearchDecoder. See CTCDecoder.from_config() to automatically create the correct type of instance dependening on config. 
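For illustration, a ctc_decoder config section that routes a model onto this beam search path could look like the following; the language model file name is hypothetical, and the numeric values mirror the setdefault() defaults set below:

config['ctc_decoder'] = {
    'type'           : 'beamsearch',
    'language_model' : 'lm.binary',  # hypothetical KenLM file, resolved against the model directory if relative
    'beam_width'     : 32,
    'alpha'          : 0.7,          # language model weight
    'beta'           : 0.0,          # word insertion bonus
    'top_k'          : 3,
}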
""" super().__init__(config, vocab) self.config.setdefault('word_threshold', -1000.0) self.reset() self.scorer = None #self.num_cores = max(os.cpu_count(), 1) # set default config # https://github.com/NVIDIA/NeMo/blob/855ce265b80c0dc40f4f06ece76d2c9d6ca1be8d/nemo/collections/asr/modules/beam_search_decoder.py#L21 self.config.setdefault('language_model', None) self.config.setdefault('beam_width', 32)#128) self.config.setdefault('alpha', 0.7 if self.language_model else 0.0) self.config.setdefault('beta', 0.0) self.config.setdefault('cutoff_prob', 1.0) self.config.setdefault('cutoff_top_n', 40) self.config.setdefault('top_k', 3) # check for language model file if self.language_model: if not os.path.isfile(self.language_model): self.config['language_model'] = os.path.join(resource_path, self.language_model) if not os.path.isfile(self.language_model): raise IOError(f"language model file '{self.language_model}' does not exist") logging.info('creating CTCBeamSearchDecoder') logging.info(str(self.config)) # create scorer if self.language_model: self.scorer = Scorer(self.config['alpha'], self.config['beta'], model_path=self.language_model, vocabulary=self.vocab) def decode(self, logits): """ Decode logits into words, and merge the new words with the previous words from the running transcript. Returns the running transcript as a list of word dictionaries, where each word dict has he following keys: 'text' (str) -- the text of the word 'score' (float) -- the probability of the word 'start_time' (int) -- the start time of the word (in timesteps) 'end_time' (int) -- the end time of the word (in timesteps) Note that the start/end times are transformed from timestamps into seconds by the ASR engine after CTCDecoder.decode() is called. """ results = ctc_beam_search_decoder_ex( logits.tolist(), self.vocab, self.config['beam_width'], self.config['cutoff_prob'], self.config['cutoff_top_n'], self.config['top_k'], self.timestep, self.scorer) if global_config.debug: print('BeamSearch results', len(results)) for idx, result in enumerate(results): print(f" beam {idx} [{result.score:.3f}] '{result.text}'") for word_idx, word in enumerate(result.words): print(f" word {word_idx} [{word.start_time}:{word.end_time} {word.score:.3f}] '{word.text}'") words = [{ 'text' : word.text, 'score' : word.score, 'start_time' : word.start_time, 'end_time' : word.end_time } for word in results[0].words] # merge new words with past words self.words = merge_words(self.words, words, self.config['word_threshold'], 'similarity') # look for silent/EOS intervals silent_intervals = find_silent_intervals(logits, len(self.vocab), self.timesteps_silence, self.timestep) if global_config.debug: print(f'silent intervals: {silent_intervals}') self.timestep += self.timestep_delta # split the words at EOS intervals if len(silent_intervals) > 0: wordlists = split_words(self.words, silent_intervals) transcripts = [] for idx, wordlist in enumerate(wordlists): # ignore blanks (silence after EOS has already occurred) if len(wordlist) == 0: continue # if there is only one wordlist, then it must be EOS # if there are multiple, then the last one is not EOS end = (len(wordlists) == 1) or (idx < (len(wordlists) - 1)) if end: wordlist = rebase_word_times(wordlist) self.reset() # TODO reset timesteps counter correctly else: self.words = wordlist transcripts.append((wordlist, end)) else: transcripts = [(self.words, False)] return [{ 'text' : transcript_from_words(words, scores=global_config.debug, times=global_config.debug, end=end, 
add_punctuation=self.config['add_punctuation']), 'words' : words, 'end' : end } for words, end in transcripts] def reset(self): """ Reset the CTC decoder state at EOS (end of sentence) """ #self.timestep = 0 #self.tail_silence = 0 self.words = [] @property def language_model(self): return self.config['language_model'] ================================================ FILE: jetson_voice/models/asr/ctc_decoder.py ================================================ #!/usr/bin/env python3 # coding: utf-8 class CTCDecoder: """ CTC decoder base class for ASR. """ @staticmethod def from_config(config, vocab, resource_path=None): """ Static factory function to instantiate the correct CTC decoder instance type from the config. config['type'] == 'greedy' -> CTCGreedyDecoder config['type'] == 'beamsearch' -> CTCBeamSearchDecoder """ type = config['type'].lower() if type == 'greedy': from .ctc_greedy import CTCGreedyDecoder return CTCGreedyDecoder(config, vocab) elif type == "beamsearch": from .ctc_beamsearch import CTCBeamSearchDecoder return CTCBeamSearchDecoder(config, vocab, resource_path) else: raise ValueError(f"invalid/unrecognized CTC decoder type '{type}'") def __init__(self, config, vocab): """ See CTCDecoder.from_config() to automatically create the correct type of instance dependening on config. """ self.config = config self.vocab = vocab self.timestep = 0 self.config.setdefault('vad_eos_duration', 0.65) # max silent time until end-of-sentence self.config.setdefault('timestep_offset', 5) # number of symbols to drop for smooth streaming def decode(self, logits): """ Decode logits into words, and merge the new words with the previous words from the running transcript. Returns the running transcript as a list of word dictionaries, where each word dict has he following keys: 'text' (str) -- the text of the word 'score' (float) -- the probability of the word 'start_time' (int) -- the start time of the word (in timesteps) 'end_time' (int) -- the end time of the word (in timesteps) Note that the start/end times are transformed from timestamps into seconds by the ASR engine after CTCDecoder.decode() is called. """ pass def reset(self): """ Reset the CTC decoder state at EOS (end of sentence) """ pass def set_timestep(self, timestep): """ Set the current timestep. """ self.timestep = timestep def set_timestep_delta(self, offset): """ Set the number of timesteps per frame. """ self.timestep_delta = offset - self.config['timestep_offset'] def set_timestep_duration(self, duration): """ Set the duration of each timestep, in seconds. """ self.timestep_duration = duration self.timesteps_silence = self.config['vad_eos_duration'] / self.timestep_duration @property def type(self): """ Return the CTC decoder type string ('greedy' or 'beamsearch') """ return self.config['type'].lower() ================================================ FILE: jetson_voice/models/asr/ctc_greedy.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import string import numpy as np from .ctc_decoder import CTCDecoder from .ctc_utils import merge_words, transcript_from_words from jetson_voice.utils import global_config class CTCGreedyDecoder(CTCDecoder): """ CTC greedy decoder that simply chooses the highest-probability logits. """ def __init__(self, config, vocab): """ Create a new CTCGreedyDecoder. TODO document config. See CTCDecoder.from_config() to automatically create the correct type of instance dependening on config. 
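As a refresher on the greedy scheme this class implements - take the argmax symbol at each timestep, collapse repeats, then drop the blank symbol '_' - here is a toy decode:

timestep_argmax = ['h', 'h', '_', 'e', 'e', '_', 'l', 'l', '_', 'l', 'o', 'o', '_']

collapsed = [s for i, s in enumerate(timestep_argmax) if i == 0 or s != timestep_argmax[i-1]]
text = ''.join(s for s in collapsed if s != '_')   # -> 'hello' (the blank keeps the two l's from merging)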
""" super().__init__(config, vocab) self.config.setdefault('word_threshold', 0.1) # add blank symbol to vocabulary if '_' not in vocab: self.vocab = vocab.copy() self.vocab.append('_') self.reset() def decode(self, logits): """ Decode logits into words, and merge the new words with the previous words from the running transcript. Returns the running transcript as a list of word dictionaries, where each word dict has he following keys: 'text' (str) -- the text of the word 'score' (float) -- the probability of the word 'start_time' (int) -- the start time of the word (in timesteps) 'end_time' (int) -- the end time of the word (in timesteps) Note that the start/end times are transformed from timestamps into seconds by the ASR engine after CTCDecoder.decode() is called. """ text = [] prob = 1.0 probs = [] # select the chars with the max probability for i in range(logits.shape[0]): argmax = np.argmax(logits[i]) text.append(self.vocab[argmax]) probs.append(logits[i][argmax]) if global_config.debug: print(text) # get the max number of sequential silent timesteps (continuing from last frame) silent_timesteps = self.end_silent_timesteps max_silent_timesteps = 0 for i in range(len(text)): if text[i] == '_': silent_timesteps += 1 else: max_silent_timesteps = max(silent_timesteps, max_silent_timesteps) if i > 0 else 0 silent_timesteps = 0 if text[-1] == '_': self.end_silent_timesteps = silent_timesteps # merge repeating chars and blank symbols _, words = self.merge_chars(text, probs) #text[:len(text)-self.config['offset']] # merge new words with past words words = merge_words(self.words, words, self.config['word_threshold'], 'overlap') # increment timestep (after this frame's timestep is done being used, and before a potential EOS reset) self.timestep += self.timestep_delta # check for EOS end = False if silent_timesteps > self.timesteps_silence: end = True self.reset() else: self.words = words return [{ 'text' : transcript_from_words(words, scores=global_config.debug, times=global_config.debug, end=end, add_punctuation=self.config['add_punctuation']), 'words' : words, 'end' : end }] def merge_chars(self, text, probs): """ Merge repeating chars and blank symbols into words. """ text_merged = '' word = None words = [] def ispunct(ch): return ch in (string.punctuation + ' ') for i in range(len(text)): if text[i] != self.prev_char and text[i] != '_': self.prev_char = text[i] if text[i] != '_': text_merged += text[i] if not ispunct(text[i]): if word is None: word = { 'text' : text[i], 'score' : probs[i], 'start_char' : len(text_merged) - 1, 'end_char' : len(text_merged), 'start_time' : self.timestep + i, 'end_time' : self.timestep + i + 1 } else: word['text'] += text[i] word['score'] *= probs[i] word['end_char'] = len(text_merged) word['end_time'] = self.timestep + i + 1 if ispunct(text[i]) and word is not None: words.append(word) word = None if word is not None: words.append(word) return text_merged, words def reset(self): """ Reset the CTC decoder state at EOS (end of sentence) """ self.prev_char = '' self.end_silent_timesteps = 0 self.timestep = 0 self.words = [] ================================================ FILE: jetson_voice/models/asr/ctc_utils.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import nltk import numpy as np from jetson_voice.utils import global_config def transcript_from_words(words, scores=False, times=False, end=False, add_punctuation=True): """ Convert a list of words to the text transcript. 
""" transcript = '' for idx, word in enumerate(words): if scores and times: transcript += f"{word['text']} ({word['start_time']}:{word['end_time']} {word['score']:.2f})" elif scores: transcript += f"{word['text']} ({word['score']:.2f})" elif times: transcript += f"{word['text']} ({word['start_time']}:{word['end_time']})" else: transcript += word['text'] if idx < len(words) - 1: transcript += ' ' if end and add_punctuation: transcript += '.' # add punctuation to end return transcript def find_overlapping_word(wordlist, word): """ Find the first word from the list with overlapping times. Returns a (word, index) tuple or (None, -1) if no overlap found. """ for idx, word2 in enumerate(wordlist): if not (word['end_time'] < word2['start_time'] or word['start_time'] > word2['end_time']): return word2, idx return None, -1 def find_word_after(wordlist, time): """ Find the nearest word that starts after the time. Returns a (word, index) tuple or (None, 1) if all words start before the time. """ if isinstance(time, tuple): time = time[1] # use the end time for idx, word in enumerate(wordlist): if time <= word['start_time']: return word, idx return None, -1 def find_word_before(wordlist, time): """ Find the nearest word that starts after the time. Returns a (word, index) tuple or (None, 1) if all words start after the time. """ if isinstance(time, tuple): time = time[0] # use the start time for idx in range(len(wordlist)-1, -1, -1): if time >= wordlist[idx]['end_time']: return wordlist[idx], idx return None, -1 def merge_words(wordlist, words, score_threshold=-np.inf, method='overlap'): """ Merge new words with past words. This works by finding overlapping or similar words, and replacing the old word with new word if the new word has a higher probability. """ if len(words) == 0: return wordlist if len(wordlist) == 0: return words # short-circuit if these are all new words if words[0]['start_time'] > wordlist[-1]['end_time']: wordlist.extend(words) return wordlist if method == 'overlap': # find words that overlap and pick the highest-scoring one for word in words: if word['score'] < score_threshold: #self.config['word_threshold']: continue if len(wordlist) == 0 or word['start_time'] > wordlist[-1]['end_time']: wordlist.append(word) continue overlap_word, overlap_idx = find_overlapping_word(wordlist, word) if overlap_word is None: continue if global_config.debug: print(f"found new '{word['text']}' ({word['start_time']}:{word['end_time']} {word['score']:.2f}) overlaps with '{overlap_word['text']}' ({overlap_word['start_time']}:{overlap_word['end_time']} {overlap_word['score']:.2f})") if word['score'] > overlap_word['score']: wordlist[overlap_idx] = word elif method == 'similarity': # find the most-similar past word to the first new word similarity_metric = np.inf #1000 similarity_index = -1 for idx in range(len(wordlist)-1, -1, -1): # search in reverse so words early in the transcript aren't matched first similarity = nltk.edit_distance(words[0]['text'], wordlist[idx]['text']) if similarity < similarity_metric: similarity_metric = similarity similarity_index = idx if similarity == 0: break if global_config.debug: print(f"closest word to '{words[0]['text']}' is '{wordlist[similarity_index]['text']}' (similarity={similarity_metric}) ") wordlist = wordlist[:similarity_index] wordlist.extend(words) else: raise ValueError(f"invalid method '{method}' (valid options are 'overlap', 'similarity')") return wordlist def split_words(wordlist, times): """ Split the word list by the given times. 
note - these times should be sorted """ wordlists = [] for time in times: _, idx = find_word_after(wordlist, time) if idx < 0: wordlists.append(wordlist) return wordlists wordlists.append(wordlist[:idx]) wordlist = wordlist[idx:] wordlists.append(wordlist) return wordlists def rebase_word_times(wordlist): """ Re-base the word timings so that the start of the first word is zero. """ if len(wordlist) == 0: return wordlist #wordlist = wordlist.copy() start_offset = wordlist[0]['start_time'] for idx in range(len(wordlist)): wordlist[idx]['start_time'] -= start_offset wordlist[idx]['end_time'] -= start_offset return wordlist def find_silent_intervals(logits, blank_symbol_id, min_silent_time, time_offset): """ Find blank/silent regions in the output logits. """ num_timesteps = logits.shape[0] silent_intervals = [] last_interval_start = None for i in range(num_timesteps): argmax = np.argmax(logits[i]) if argmax == blank_symbol_id: if last_interval_start is None: last_interval_start = i if last_interval_start is not None and (argmax != blank_symbol_id or (i == num_timesteps-1)): if i - last_interval_start >= min_silent_time: silent_intervals.append((last_interval_start + time_offset, i-1+time_offset)) # print(f' new silent interval ({last_interval_start + self.timestep}:{i-1+self.timestep}) {i - last_interval_start} > {min_length:.2f}') #else: # print(f'skipping silent interval ({last_interval_start + self.timestep}:{i-1+self.timestep}) {i - last_interval_start} < {min_length:.2f}') last_interval_start = None return silent_intervals ================================================ FILE: jetson_voice/models/nlp/__init__.py ================================================ #!/usr/bin/env python3 # coding: utf-8 from .intent_slot import IntentSlotEngine from .question_answer import QuestionAnswerEngine from .text_classification import TextClassificationEngine from .token_classification import TokenClassificationEngine ================================================ FILE: jetson_voice/models/nlp/intent_slot.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import logging import numpy as np from transformers import AutoTokenizer from jetson_voice.nlp import IntentSlotService from jetson_voice.utils import load_model, normalize_logits from .nlp_utils import find_subtokens, nlp_dynamic_shapes class IntentSlotEngine(IntentSlotService): """ Joint Intent and Slot classification model in TensorRT / onnxruntime. """ def __init__(self, config, *args, **kwargs): """ Load an Intent/Slot classification model from ONNX """ super(IntentSlotEngine, self).__init__(config, *args, **kwargs) if self.config.type != 'intent_slot': raise ValueError(f"{self.config.model_path} isn't an Intent/Slot model (type '{self.config.type}'") # load model dynamic_shapes = {'max' : (1, self.config['language_model']['max_seq_length'])} # (batch_size, sequence_length) if nlp_dynamic_shapes: dynamic_shapes['min'] = (1, 1) self.model = load_model(self.config, dynamic_shapes) # create tokenizer self.tokenizer = AutoTokenizer.from_pretrained(self.config['tokenizer']['tokenizer_name']) self.null_slot = self.slot_labels[-1] # 'O' in assistant dataset - always the last label? def __call__(self, query): """ Perform intent/slot classification on the input query. Parameters: query (string) -- The text query, for example: 'What is the weather in San Francisco tomorrow?' 
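For a query like the one above, a weather-domain model might return something like the following (hypothetical intent/slot labels and scores):

    {'intent' : 'weather.weather', 'score' : 0.98,
     'slots' : [{'slot' : 'weatherplace', 'text' : 'San Francisco', 'score' : 0.92}]}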
Returns a dict with the following keys: 'intent' (string) -- the classified intent label 'score' (float) -- the intent probability [0,1] 'slots' (list[dict]) -- a list of dicts, where each dict has the following keys: 'slot' (string) -- the slot label 'text' (string) -- the slot text from the query 'score' (float) -- the slot probability [0,1] """ encodings = self.tokenizer( text=query, padding='longest' if nlp_dynamic_shapes else 'max_length', truncation=True, max_length=self.config['language_model']['max_seq_length'], return_tensors='np', return_token_type_ids=True, return_overflowing_tokens=True, return_offsets_mapping=True, return_special_tokens_mask=True, ) # during slot classification, we want to ignore slots from subtokens and special tokens subtoken_mask = find_subtokens(encodings, method='subtoken_delimiters') ignore_mask = subtoken_mask | encodings['special_tokens_mask'] # retrieve the inputs from the encoded tokens inputs = {} for input in self.model.inputs: if input.name not in encodings: raise ValueError(f"the encoded inputs from the tokenizer doesn't contain '{input.name}'") inputs[input.name] = encodings[input.name] # run the model intent_logits, slot_logits = self.model.execute(inputs) intent_logits = normalize_logits(intent_logits) slot_logits = normalize_logits(slot_logits) intent_preds = np.argmax(intent_logits, axis=-1) slot_preds = np.argmax(slot_logits, axis=-1) # convert numerical outputs to intent/slot labels results = [] for query_idx, intent_id in enumerate(intent_preds): results.append({ 'intent' : self.intent_label(intent_id), 'score' : intent_logits[query_idx][intent_id], 'slots' : [] }) for query_idx, slots in enumerate(slot_preds): query_slots = [self.slot_label(slot) for slot in slots] for token_idx, slot in enumerate(query_slots): # ignore unclassified slots or masked tokens if slot == self.null_slot or ignore_mask[query_idx][token_idx]: continue # convert from token index back to the query string chars = encodings.token_to_chars(query_idx, token_idx) text = query[chars[0]:chars[1]] # queries[query_idx] # append subtokens from the query to the text for subtoken_idx in range(token_idx+1, len(query_slots)): if subtoken_mask[query_idx][subtoken_idx]: subtoken_chars = encodings.token_to_chars(query_idx, subtoken_idx) text += query[subtoken_chars[0]:subtoken_chars[1]] else: break results[query_idx]['slots'].append({ 'slot' : slot, 'text' : text, 'score' : slot_logits[query_idx][token_idx][slots[token_idx]] }) if len(results) == 1: return results[0] else: return results @property def intent_labels(self): """ List of the intent class labels. """ return self.config['data_desc']['intent_labels'] def intent_label(self, index): """ Return an intent label by index (with bounds checking) """ return self.intent_labels[int(index)] if index < len(self.intent_labels) else 'Unknown_Intent' @property def slot_labels(self): """ List of the slot class labels. 
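For example, an assistant-style dataset might define labels like ['alarm_time', 'weatherplace', ..., 'O'] (hypothetical), with the null slot 'O' assumed to be the last entry.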
""" return self.config['data_desc']['slot_labels'] def slot_label(self, index): """ Return a slot label by index (with bounds checking) """ return self.slot_labels[int(index)] if index < len(self.slot_labels) else self.null_slot ================================================ FILE: jetson_voice/models/nlp/nlp_utils.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import numpy as np # NLP BERT models (and BERT derivatives) have myelin problem with dynamic shapes on aarch64, # so we disable dynamic shape changing for now (shapes will be set to the max sequence length) nlp_dynamic_shapes=False def find_subtokens(encodings, method='char_span'): """ Compute the subtoken mask, where each token is marked as True if it's a subtoken or False otherwise. Longer words/acronyms may be tokenized into mulitple word pieces (called subtokens), for example: 'Yosemite' -> ['yo', '##se', '##mite'] 'U.S.' -> ['u', '.', 's', '.'] Parameters: encodings (BatchEncoding) -- Output from tokenizer method (string) -- If 'char_span', the subtoken mask will be determined by looking at the character indices. Tokens that map to characters that are side-by-side are flagged as subtokens. If 'subtoken_delimiters', subtokens will be identified by looking for '##' symbols. However this can miss punctuated subtokens, such as 'U.S.' Returns boolean subtoken mask array with shape (num_queries, num_tokens) """ num_queries = encodings['input_ids'].shape[0] subtoken_mask = [] if method == 'char_span': for query_idx in range(num_queries): mask = [] last_char = -1 tokens = encodings.tokens(query_idx) for token_idx, word_id in enumerate(encodings.word_ids(query_idx)): if word_id is None: # skip special tokens mask.append(False) continue chars = encodings.token_to_chars(query_idx, token_idx) if chars[0] == last_char: mask.append(True) else: mask.append(False) last_char = chars[1] subtoken_mask.append(mask) elif method == 'subtoken_delimiters': for query_idx in range(num_queries): subtoken_mask.append([token.startswith('##') for token in encodings.tokens(query_idx)]) else: raise ValueError(f"invalid method ('{method}')") return np.asarray(subtoken_mask) ================================================ FILE: jetson_voice/models/nlp/question_answer.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import logging import numpy as np from transformers import AutoTokenizer from jetson_voice.nlp import QuestionAnswerService from jetson_voice.utils import load_model, normalize_logits from .nlp_utils import nlp_dynamic_shapes class QuestionAnswerEngine(QuestionAnswerService): """ Question answering model in TensorRT / onnxruntime. """ def __init__(self, config, *args, **kwargs): """ Load an question answering model from ONNX """ super(QuestionAnswerEngine, self).__init__(config, *args, **kwargs) if self.config.type != 'qa': raise ValueError(f"{self.config.model_path} isn't a Question Answering model (type '{self.config.type}'") # load model dynamic_shapes = {'max' : (1, self.config['dataset']['max_seq_length'])} # (batch_size, sequence_length) if nlp_dynamic_shapes: dynamic_shapes['min'] = (1, 1) self.model = load_model(self.config, dynamic_shapes) # create tokenizer self.tokenizer = AutoTokenizer.from_pretrained(self.config['tokenizer']['tokenizer_name']) self.question_first = bool(self.tokenizer.padding_side == "right") def __call__(self, query, top_k=1): """ Perform question/answering on the input query. 
Parameters: query (dict or tuple) -- Either a dict with 'question' and 'context' keys, or a (question, context) tuple. top_k (int) -- How many of the top results to return, sorted by score. The default (top_k=1) is to return just the top result. If top_k > 1, then a list of results will be returned. Returns: dict(s) with the following keys: 'answer' (string) -- the answer text 'score' (float) -- the probability [0,1] 'start' (int) -- the starting character index of the answer into the context text 'end' (int) -- the ending character index of the answer into the context text If top_k > 1, a list of dicts with the top_k results will be returned. If top_k == 1, just the single dict with the top score will be returned. """ if isinstance(query, dict): question = query['question'] context = query['context'] elif isinstance(query, tuple): question = query[0] context = query[1] else: raise ValueError(f'query must be a dict or tuple (instead was type {type(query).__name__})') # check for models that have a doc_stride >= max_seq_length # this will cause an exception in the tokenizer doc_stride = self.config['dataset']['doc_stride'] max_seq_len = self.config['dataset']['max_seq_length'] if doc_stride >= max_seq_len: doc_stride = int(max_seq_len/2) # tokenize the inputs encodings = self.tokenizer( text=question if self.question_first else context, text_pair=context if self.question_first else question, padding='longest' if nlp_dynamic_shapes else 'max_length', truncation="only_second" if self.question_first else "only_first", max_length=max_seq_len, stride=doc_stride, return_tensors='np', return_token_type_ids=True, return_overflowing_tokens=True, return_offsets_mapping=True, return_special_tokens_mask=True, ) # When the input is too long, it's converted into a batch of inputs with overflowing tokens # and a stride of overlap between the inputs. If a batch of inputs is given, a special output # "overflow_to_sample_mapping" indicates which member of the encoded batch belongs to which original batch sample. # Here we tokenize examples one-by-one so we don't need to use "overflow_to_sample_mapping". # "num_spans" is the number of output samples generated from the overflowing tokens.
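# As a rough illustration (hypothetical numbers): with max_seq_length=384 and doc_stride=128, a context that encodes to more than 384 tokens is split into multiple overlapping spans of up to 384 tokens each (with 128 tokens of overlap between consecutive spans), and each span is run through the model separately below.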
num_spans = len(encodings["input_ids"]) logging.debug(f'num_spans: {num_spans}') # p_mask: mask with 1 for tokens that cannot be in the answer (0 for tokens which can be in an answer) # We put 0 on the tokens from the context and 1 everywhere else (question and special tokens) p_mask = np.asarray( [ [tok != 1 if self.question_first else 0 for tok in encodings.sequence_ids(span_id)] for span_id in range(num_spans) ] ) # keep the cls_token unmasked (some models use it to indicate unanswerable questions) if self.tokenizer.cls_token_id is not None: cls_index = np.nonzero(encodings["input_ids"] == self.tokenizer.cls_token_id) p_mask[cls_index] = 0 # run the model over each span (TODO batching) model_outputs = [] for span_idx in range(num_spans): inputs = {} for input in self.model.inputs: if input.name not in encodings: raise ValueError(f"the encoded inputs from the tokenizer doesn't contain '{input.name}'") inputs[input.name] = np.expand_dims(encodings[input.name][span_idx], axis=0) # add batch dim model_outputs.append(self.model.execute(inputs)) # post-processing answers = [] min_null_score = 1000000 handle_impossible_answer = self.config['dataset']['version_2_with_negative'] for span_idx in range(num_spans): start_logits = np.squeeze(model_outputs[span_idx][:,:,0]) end_logits = np.squeeze(model_outputs[span_idx][:,:,1]) # Ensure padded tokens & question tokens cannot belong to the set of candidate answers. undesired_tokens = np.abs(p_mask[span_idx] - 1) & encodings['attention_mask'][span_idx] # Generate mask undesired_tokens_mask = (undesired_tokens == 0.0) # Make sure non-context indexes in the tensor cannot contribute to the softmax start_logits = np.where(undesired_tokens_mask, -10000.0, start_logits) end_logits = np.where(undesired_tokens_mask, -10000.0, end_logits) # Normalize logits and spans to retrieve the answer start_logits = np.exp(start_logits - np.log(np.sum(np.exp(start_logits), axis=-1, keepdims=True))) end_logits = np.exp(end_logits - np.log(np.sum(np.exp(end_logits), axis=-1, keepdims=True))) if handle_impossible_answer: min_null_score = min(min_null_score, (start_logits[0] * end_logits[0]).item()) # Mask CLS start_logits[0] = end_logits[0] = 0.0 # Decode token probabilities starts, ends, scores = self.decode(start_logits, end_logits, top_k=top_k) if self.tokenizer.is_fast: # Convert the answer (tokens) back to the original text # Score: score from the model # Start: Index of the first character of the answer in the context string # End: Index of the character following the last character of the answer in the context string # Answer: Plain text of the answer enc = encodings[span_idx] # Sometimes the max probability token is in the middle of a word so: # - we start by finding the right word containing the token with `token_to_word` # - then we convert this word in a character span with `word_to_chars` for s, e, score in zip(starts, ends, scores): start = enc.word_to_chars(enc.token_to_word(s), sequence_index=1 if self.question_first else 0)[0] end = enc.word_to_chars(enc.token_to_word(e), sequence_index=1 if self.question_first else 0)[1] answers.append({ 'answer' : context[start : end], 'score' : score.item(), 'start' : start, 'end' : end }) else: raise NotImplementedError('QA post-processing is only implemented for fast tokenizers') if handle_impossible_answer: answers.append({'answer': '', 'score': min_null_score, 'start': 0, 'end': 0}) answers = sorted(answers, key=lambda x: x['score'], reverse=True)[:top_k] if top_k == 1: return answers[0] else: return answers def decode(self, start: np.ndarray, end: np.ndarray, top_k: int): """ Takes the QA model output and generates the probability for each span to be the actual answer. In addition, it filters out some unwanted/impossible cases, like the answer length being greater than max_answer_len or the answer end position being before the start position. The method supports outputting the k-best answers through the top_k argument. Args: start (:obj:`np.ndarray`): Individual start probabilities for each token. end (:obj:`np.ndarray`): Individual end probabilities for each token. top_k (:obj:`int`): Indicates how many possible answer span(s) to extract from the model output. max_answer_len (:obj:`int`): Maximum size of the answer to extract from the model's output (read from the model config's 'max_answer_length' rather than passed as an argument). """ # Ensure we have batch axis if start.ndim == 1: start = start[None] if end.ndim == 1: end = end[None] # Compute the score of each tuple(start, end) to be the real answer outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1)) # Remove candidates with end < start or end - start > max_answer_len candidates = np.tril(np.triu(outer), self.config['dataset']['max_answer_length'] - 1) # Inspired by Chen & al. (https://github.com/facebookresearch/DrQA) scores_flat = candidates.flatten() if top_k == 1: idx_sort = [np.argmax(scores_flat)] elif len(scores_flat) < top_k: idx_sort = np.argsort(-scores_flat) else: idx = np.argpartition(-scores_flat, top_k)[0:top_k] idx_sort = idx[np.argsort(-scores_flat[idx])] start, end = np.unravel_index(idx_sort, candidates.shape)[1:] return start, end, candidates[0, start, end] ================================================ FILE: jetson_voice/models/nlp/text_classification.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import logging import numpy as np from transformers import AutoTokenizer from jetson_voice.nlp import TextClassificationService from jetson_voice.utils import load_model, normalize_logits from .nlp_utils import nlp_dynamic_shapes class TextClassificationEngine(TextClassificationService): """ Text classification model in TensorRT / onnxruntime. """ def __init__(self, config, *args, **kwargs): """ Load a text classification model from ONNX """ super(TextClassificationEngine, self).__init__(config, *args, **kwargs) if self.config.type != 'text_classification': raise ValueError(f"{self.config.model_path} isn't a Text Classification model (type '{self.config.type}')") # load model dynamic_shapes = {'max' : (1, self.config['dataset']['max_seq_length'])} # (batch_size, sequence_length) if nlp_dynamic_shapes: dynamic_shapes['min'] = (1, 1) self.model = load_model(self.config, dynamic_shapes) # create tokenizer self.tokenizer = AutoTokenizer.from_pretrained(self.config['tokenizer']['tokenizer_name']) def __call__(self, query): """ Perform text classification on the input query. Parameters: query (string) -- The text query, for example: 'Today was warm, sunny and beautiful out.'
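For the example query above, a two-class sentiment model might return something like (hypothetical class and score):

    {'class' : 1, 'label' : '1', 'score' : 0.99}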
Returns a dict with the following keys: 'class' (int) -- the predicted class index 'label' (string) -- the predicted class label (and if there aren't labels `str(class)`) 'score' (float) -- the classification probability [0,1] """ encodings = self.tokenizer( text=query, padding='longest' if nlp_dynamic_shapes else 'max_length', truncation=True, max_length=self.config['dataset']['max_seq_length'], return_tensors='np', return_token_type_ids=True, return_overflowing_tokens=True, return_offsets_mapping=True, return_special_tokens_mask=True, ) # retrieve the inputs from the encoded tokens inputs = {} for input in self.model.inputs: if input.name not in encodings: raise ValueError(f"the encoded inputs from the tokenizer doesn't contain '{input.name}'") inputs[input.name] = encodings[input.name] # run the model logits = self.model.execute(inputs) logits = normalize_logits(logits) preds = np.argmax(logits, axis=-1) # tabulate results results = [] for query_idx in range(preds.shape[0]): results.append({ 'class' : int(preds[query_idx]), 'label' : str(preds[query_idx]), 'score' : logits[query_idx][preds[query_idx]] }) if len(results) == 1: return results[0] else: return results ================================================ FILE: jetson_voice/models/nlp/token_classification.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import logging import numpy as np from transformers import AutoTokenizer from jetson_voice.nlp import TokenClassificationService from jetson_voice.utils import load_model, normalize_logits from .nlp_utils import find_subtokens, nlp_dynamic_shapes class TokenClassificationEngine(TokenClassificationService): """ Token classification model (aka Named Entity Recognition) in TensorRT / onnxruntime. """ def __init__(self, config, *args, **kwargs): """ Load an token classification model for NER from ONNX """ super(TokenClassificationEngine, self).__init__(config, *args, **kwargs) if self.config.type != 'token_classification': raise ValueError(f"{self.config.model_path} isn't a Token Classification model (type '{self.config.type}'") # load model dynamic_shapes = {'max' : (1, self.config['dataset']['max_seq_length'])} # (batch_size, sequence_length) if nlp_dynamic_shapes: dynamic_shapes['min'] = (1, 1) self.model = load_model(self.config, dynamic_shapes) # create tokenizer self.tokenizer = AutoTokenizer.from_pretrained(self.config['tokenizer']['tokenizer_name']) def __call__(self, query): """ Perform token classification (NER) on the input query and return tagged entities. 
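For example, in the query below a NER model might tag 'Ben' as a person (B-PER) and 'Chicago' as a location (B-LOC) (hypothetical labels).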
Parameters: query (string) -- The text query, for example: "Ben is from Chicago, a city in the state of Illinois, US' Returns a list[dict] of tagged entities with the following dictionary keys: 'class' (int) -- the entity class index 'label' (string) -- the entity class label 'score' (float) -- the classification probability [0,1] 'text' (string) -- the corresponding text from the input query 'start' (int) -- the starting character index of the text 'end' (int) -- the ending character index of the text """ encodings = self.tokenizer( text=query, padding='longest' if nlp_dynamic_shapes else 'max_length', truncation=True, max_length=self.config['dataset']['max_seq_length'], return_tensors='np', return_token_type_ids=True, return_overflowing_tokens=True, return_offsets_mapping=True, return_special_tokens_mask=True, ) # during token classification, we want to ignore slots from subtokens and special tokens subtoken_mask = find_subtokens(encodings) ignore_mask = subtoken_mask | encodings['special_tokens_mask'] # retrieve the inputs from the encoded tokens inputs = {} for input in self.model.inputs: if input.name not in encodings: raise ValueError(f"the encoded inputs from the tokenizer doesn't contain '{input.name}'") inputs[input.name] = encodings[input.name] # run the model logits = self.model.execute(inputs) logits = normalize_logits(logits) preds = np.argmax(logits, axis=-1) probs = np.amax(logits, axis=-1) # tabulate results tags = [] label_map = {v: k for k, v in self.config['label_ids'].items()} num_queries, num_tokens, _ = logits.shape assert num_queries == 1 # there should only be 1 input query currently for query_idx in range(num_queries): query_tags = [] for token_idx in range(num_tokens): label = label_map[preds[query_idx][token_idx]] # ignore unclassified slots or masked tokens if label == self.config['dataset']['pad_label'] or ignore_mask[query_idx][token_idx]: continue # convert from token index back to the query string chars = encodings.token_to_chars(query_idx, token_idx) # append subtokens from the query to the text for subtoken_idx in range(token_idx+1, num_tokens): if subtoken_mask[query_idx][subtoken_idx]: chars = (chars[0], encodings.token_to_chars(query_idx, subtoken_idx)[1]) else: break text = query[chars[0]:chars[1]] # queries[query_idx] # strip out punctuation to attach the entity tag to the word not to a punctuation mark if not text[-1].isalpha(): text = text[:-1] chars = (chars[0], chars[1]-1) query_tags.append({ 'label' : label, 'class' : preds[query_idx][token_idx], 'score' : probs[query_idx][token_idx], 'text' : text, 'start' : chars[0], 'end' : chars[1] }) tags.append(query_tags) if len(tags) == 1: return tags[0] else: return tags ================================================ FILE: jetson_voice/models/tts/__init__.py ================================================ #!/usr/bin/env python3 # coding: utf-8 from .tts_engine import TTSEngine ================================================ FILE: jetson_voice/models/tts/tts_engine.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import re import logging import inflect import numpy as np from jetson_voice.tts import TTSService from jetson_voice.utils import global_config, load_model, softmax class TTSEngine(TTSService): """ Text-to-speech synthesis. 
This is actually a pipeline of two models, the generator model (which generates MEL spectrograms from tokens), and the vocoder (which outputs audio from MEL spectrograms) """ def __init__(self, config, *args, **kwargs): """ Loads a text-to-speech model from ONNX or serialized TensorRT engine. Parameters: model (string) -- path to ONNX model or serialized TensorRT engine/plan config (string) -- path to model configuration json (will be inferred from model path if empty) """ super(TTSEngine, self).__init__(config, *args, **kwargs) if self.config.type != 'tts': raise ValueError(f"{self.config.model_path} isn't a Text-to-Speech model (type '{self.config.type}')") # load text->MEL generator model self.generator = load_model(self.config.generator) # load MEL->audio vocoder model features = self.config.vocoder.features dynamic_shapes = { 'min' : (1, features, 1), 'opt' : (1, features, 160), # ~5-6 words 'max' : (1, features, 1024) # ~20-30 words? } self.vocoder = load_model(self.config.vocoder, dynamic_shapes=dynamic_shapes) # create map of symbol->ID embeddings self.symbol_to_id = {s: i for i, s in enumerate(self.get_symbols())} # create operators for num-to-word conversion self.number_regex = re.compile(r'\d+(?:,\d+)?') # https://stackoverflow.com/a/16321189 self.number_inflect = inflect.engine() def __call__(self, text): """ Generate audio from text. Parameters: text (string) -- The phrase to convert to audio. Returns audio samples in a numpy array. """ text = self.numbers_to_words(text) # vocab doesn't include numbers, so convert them to words pad_symbol = ' ' min_length = 6 if text[-1].isalnum(): # ensure the text ends with punctuation/whitespace, otherwise the audio gets cut off text += pad_symbol if len(text) < min_length: # WAR for cuDNN error on JetPack <= 4.5.x text = text.ljust(min_length, pad_symbol) # convert chars to symbol embeddings encoded_text = [self.symbol_to_id[s] for s in text.lower() if s in self.symbol_to_id] encoded_text = np.expand_dims(np.array(encoded_text, dtype=np.int64), axis=0) # generate MEL spectrogram + audio mels = self.generator.execute(encoded_text)[0] audio = self.vocoder.execute(mels) return audio.squeeze() def get_symbols(self): """ Return a list of all the accepted character symbols / embeddings """ _arpabet = [ 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' ] _arpabet = ['@' + s for s in _arpabet] _pad = '_' _punctuation = '!\'(),.:;? ' _special = '-' _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' symbols = list(_pad + _special + _punctuation + _letters) + _arpabet return symbols def numbers_to_words(self, text): """ Convert instances of numbers to words in the text. For example: "The answer is 42" -> "The answer is forty-two."
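Note that the regex above only matches integers (optionally with a single comma group, e.g. '1,024'); floating-point numbers aren't handled yet (see the TODO below).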
""" number_tokens = self.number_regex.findall(text) for number_token in number_tokens: # TODO test/handle floating-point numbers word_text = self.number_inflect.number_to_words(number_token) num_begin = text.index(number_token) # insert the words back at the old location text = text[:num_begin] + word_text + text[num_begin + len(number_token):] return text @property def sample_rate(self): """ Get the output sample rate (e.g. 22050, 44100, ect) """ return self.config['vocoder']['sample_rate'] ================================================ FILE: jetson_voice/nlp.py ================================================ #!/usr/bin/env python3 # coding: utf-8 from jetson_voice.utils import load_resource def NLP(resource, *args, **kwargs): """ Factory for automatically loading NLP models or services. Returns an instance of: - IntentSlotService - QuestionAnswerService - TextClassificationService - TokenClassificationService """ from jetson_voice.auto import AutoModel return AutoModel(resource, domain='nlp', *args, **kwargs) def IntentSlot(resource, *args, **kwargs): """ Loads a NLP joint intent/slot classifier service or model. See the IntentSlotService class for the signature that implementations use. """ factory_map = { 'tensorrt' : 'jetson_voice.models.nlp.IntentSlotEngine', 'onnxruntime' : 'jetson_voice.models.nlp.IntentSlotEngine' } return load_resource(resource, factory_map, *args, **kwargs) class IntentSlotService(): """ Intent/slot classifier service base class. """ def __init__(self, config, *args, **kwargs): """ Create service instance. """ self.config = config def __call__(self, query): """ Perform intent/slot classification on the input query. Parameters: query (string) -- The text query, for example: 'What is the weather in San Francisco tomorrow?' Returns a dict with the following keys: 'intent' (string) -- the classified intent label 'score' (float) -- the intent probability [0,1] 'slots' (list[dict]) -- a list of dicts, where each dict has the following keys: 'slot' (string) -- the slot label 'text' (string) -- the slot text from the query 'score' (float) -- the slot probability [0,1] """ pass def QuestionAnswer(resource, *args, **kwargs): """ Loads a NLP question answering service or model. See the QuestionAnswerService class for the signature that implementations use. """ factory_map = { 'tensorrt' : 'jetson_voice.models.nlp.QuestionAnswerEngine', 'onnxruntime' : 'jetson_voice.models.nlp.QuestionAnswerEngine' } return load_resource(resource, factory_map, *args, **kwargs) class QuestionAnswerService(): """ Question answering service base class. """ def __init__(self, config, *args, **kwargs): """ Create service instance. """ self.config = config def __call__(self, query, top_k=1): """ Perform question/answering on the input query. Parameters: query (dict or tuple) -- Either a dict with 'question' and 'context' keys, or a (question, context) tuple. top_k (int) -- How many of the top results to return, sorted by score. The default (topk=1) is to return just the top result. If topk > 1, then a list of results will be returned. Returns: dict(s) with the following keys: 'answer' (string) -- the answer text 'score' (float) -- the probability [0,1] 'start' (int) -- the starting character index of the answer into the context text 'end' (int) -- the ending character index of the answer into the context text If top_k > 1, a list of dicts with the topk results will be returned. If top_k == 1, just the single dict with the top score will be returned. 
""" pass def TextClassification(resource, *args, **kwargs): """ Loads a NLP text classification service or model. See the TextClassificationService class for the signature that implementations use. """ factory_map = { 'tensorrt' : 'jetson_voice.models.nlp.TextClassificationEngine', 'onnxruntime' : 'jetson_voice.models.nlp.TextClassificationEngine' } return load_resource(resource, factory_map, *args, **kwargs) class TextClassificationService(): """ Text classification service base class. """ def __init__(self, config, *args, **kwargs): """ Create service instance. """ self.config = config def __call__(self, query): """ Perform text classification on the input query. Parameters: query (string) -- The text query, for example: 'Today was warm, sunny and beautiful out.' Returns a dict with the following keys: 'class' (int) -- the predicted class index 'label' (string) -- the predicted class label (and if there aren't labels `str(class)`) 'score' (float) -- the classification probability [0,1] """ pass def TokenClassification(resource, *args, **kwargs): """ Loads a NLP token classification (aka Named Entity Recognition) service or model. See the TokenClassificationService class for the signature that implementations use. """ factory_map = { 'tensorrt' : 'jetson_voice.models.nlp.TokenClassificationEngine', 'onnxruntime' : 'jetson_voice.models.nlp.TokenClassificationEngine' } return load_resource(resource, factory_map, *args, **kwargs) class TokenClassificationService(): """ Token classification (aka Named Entity Recognition) service base class. """ def __init__(self, config, *args, **kwargs): """ Create service instance. """ self.config = config def __call__(self, query): """ Perform token classification (NER) on the input query and return tagged entities. Parameters: query (string) -- The text query, for example: "Ben is from Chicago, a city in the state of Illinois, US' Returns a list[dict] of tagged entities with the following dictionary keys: 'class' (int) -- the entity class index 'label' (string) -- the entity class label 'score' (float) -- the classification probability [0,1] 'text' (string) -- the corresponding text from the input query 'start' (int) -- the starting character index of the text 'end' (int) -- the ending character index of the text """ pass @staticmethod def tag_string(query, tags, scores=False): """ Returns a string with the tags inserted inline with the query. For example: "Ben[B-PER] is from Chicago[B-LOC], a city in the state of Illinois[B-LOC], US[B-LOC]" Parameters: query (string) -- The original query string. tags (list[dict]) -- The tags predicted by the model. scores (bool) -- If true, the probabilities will be added inline. If false (default), only the tag labels will be added. 
""" char_offset = 0 for tag in tags: if scores: tag_str = f"[{tag['label']} {tag['score']:.3}]" else: tag_str = f"[{tag['label']}]" query = query[:tag['end'] + char_offset] + tag_str + query[tag['end'] + char_offset:] char_offset += len(tag_str) return query if __name__ == "__main__": from jetson_voice import ConfigArgParser import pprint parser = ConfigArgParser() parser.add_argument('--model', default='distilbert_intent', type=str) parser.add_argument('--type', default='intent_slot', type=str) args = parser.parse_args() args.type = args.type.lower() print(args) if args.type == 'intent_slot': model = IntentSlot(args.model) # create some test queries queries = [ 'Set alarm for Seven Thirty AM', 'Please increase the volume', 'What is my schedule for tomorrow', 'Place an order for a large pepperoni pizza from Dominos' ] # process the queries for query in queries: results = model(query) print('\n') print('query:', query) print('') pprint.pprint(results) elif args.type == 'question_answer' or args.type == 'qa': model = QuestionAnswer(args.model) # create some test queries queries = [] queries.append({ "question" : "What is the value of Pi?", "context" : "Some people have said that Pi is tasty but there should be a value for Pi, and the value for Pi is around 3.14. " "Pi is the ratio of a circle's circumference to it's diameter. The constant Pi was first calculated by Archimedes " "in ancient Greece around the year 250 BC." }) queries.append({ "question" : "Who discovered Pi?", "context" : queries[-1]['context'] }) queries.append({ "question" : "Which nation contains the majority of the Amazon forest?", "context" : "The Amazon rainforest is a moist broadleaf forest that covers most of the Amazon basin of South America. " "This basin encompasses 7,000,000 square kilometres (2,700,000 sq mi), of which 5,500,000 square kilometres " "(2,100,000 sq mi) are covered by the rainforest. The majority of the forest is contained within Brazil, " "with 60% of the rainforest, followed by Peru with 13%, and Colombia with 10%." }) queries.append({ "question" : "How large is the Amazon rainforest?", "context" : queries[-1]['context'] }) # process the queries for query in queries: answers = model(query, top_k=5) print('\n') print('context:', query['context']) print('') print('question:', query['question']) for answer in answers: print('') print('answer: ', answer['answer']) print('score: ', answer['score']) elif args.type == 'text_classification': model = TextClassification(args.model) # create some test queries (these are for sentiment models) queries = [ "By the end of no such thing the audience, like beatrice, has a watchful affection for the monster.", "Director Rob Marshall went out gunning to make a great one.", "Uneasy mishmash of styles and genres.", "I love exotic science fiction / fantasy movies but this one was very unpleasant to watch. I gave it 4 / 10 since some special effects were nice.", "Today was cold and rainy and not very nice.", "Today was warm, sunny and beautiful out.", ] # process the queries for query in queries: results = model(query) print('\nquery:', query) pprint.pprint(results) elif args.type == 'token_classification': model = TokenClassification(args.model) # create some test queries queries = [ "But candidate Charles Baker, who has about eight percent of the vote, has called for an investigation into reports of people voting multiple times.", "Analysts say Mr. 
Chung's comments may be part of efforts by South Korea to encourage North Korea to resume bilateral talks.", "The 63-year-old Daltrey walked offstage during the first song; guitarist Pete Townshend later told the crowd he was suffering from bronchitis and could barely speak.", "The Who is currently touring in support of Endless Wire, its first album since 1982.", "Meanwhile, Iowa is cleaning up after widespread flooding inundated homes, destroyed crops and cut off highways and bridges.", "At the White House Tuesday, U.S. President George Bush expressed concern for the flood victims.", "Ben is from Chicago, a city in the state of Illinois, US with a population of 2.7 million people.", "Lisa's favorite place to climb in the summer is El Capitan in Yosemite National Park in California, U.S." ] # process the queries for query in queries: tags = model(query) #print(f'\n{query}') #pprint.pprint(tags) print(f'\n{model.tag_string(query, tags, scores=True)}') else: raise ValueError(f"invalid --type argument ({args.type})") ================================================ FILE: jetson_voice/tts.py ================================================ #!/usr/bin/env python3 # coding: utf-8 from jetson_voice.utils import load_resource def TTS(resource, *args, **kwargs): """ Loads a TTS service or model. See the TTSService class for the signature that implementations use. """ factory_map = { 'riva' : 'jetson_voice.backends.riva.RivaTTSService', 'tensorrt' : 'jetson_voice.models.tts.TTSEngine', 'onnxruntime' : 'jetson_voice.models.tts.TTSEngine' } return load_resource(resource, factory_map, *args, **kwargs) class TTSService(): """ TTS service base class. """ def __init__(self, config, *args, **kwargs): """ Create service instance. """ self.config = config def __call__(self, text): """ Generate audio from text. Parameters: text (string) -- The phrase to convert to audio. Returns audio samples in a numpy array. """ pass @property def sample_rate(self): """ Get the output sample rate (in Hz) """ pass if __name__ == "__main__": from jetson_voice import list_audio_devices, ConfigArgParser from soundfile import SoundFile import pprint import pyaudio import time parser = ConfigArgParser() parser.add_argument('--model', default='fastpitch_hifigan', type=str) parser.add_argument('--text', default='Hello, how are you today?', type=str) parser.add_argument('--warmup', type=int, default=9, help='the number of warmup runs') parser.add_argument("--output-device", type=int, default=None, help='output audio device to use') parser.add_argument("--output-wav", type=str, default=None, help='output wav file to write to') parser.add_argument('--list-devices', action='store_true', help='list audio input devices') args = parser.parse_args() print(args) # list audio devices if args.list_devices: list_audio_devices() # load the model tts = TTS(args.model) # display the text print(f"\n'{args.text}'\n") # run the TTS for run in range(args.warmup+1): start = time.perf_counter() audio = tts(args.text) stop = time.perf_counter() latency = stop-start duration = audio.shape[0]/tts.sample_rate print(f"Run {run} -- Time to first audio: {latency:.3f}s. Generated {duration:.2f}s of audio. 
RTFx={duration/latency:.2f}.") # output the audio if args.output_device is not None: p = pyaudio.PyAudio() stream = p.open(output_device_index=args.output_device, format=pyaudio.paFloat32, channels=1, rate=tts.sample_rate, output=True) stream.write(audio.tobytes()) stream.stop_stream() stream.close() if args.output_wav is not None: wav = SoundFile(args.output_wav, mode='w', samplerate=tts.sample_rate, channels=1) wav.write(audio) wav.close() print(f"Wrote audio to {args.output_wav}") ================================================ FILE: jetson_voice/utils/__init__.py ================================================ #!/usr/bin/env python3 # coding: utf-8 from .config import global_config, ConfigDict, ConfigArgParser from .resource import find_resource, load_resource, load_model, list_models from .audio import * from .softmax import softmax, normalize_logits ================================================ FILE: jetson_voice/utils/audio.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import math import pprint import logging import librosa import soundfile import pyaudio as pa import numpy as np def audio_db(samples): """ Compute RMS of audio samples in dB. """ rms = librosa.feature.rms(y=samples, frame_length=samples.shape[0], center=False)[0][0] if rms != 0.0: return 20.0 * math.log10(rms) else: return -100.0 def audio_to_float(samples): """ Convert audio samples to 32-bit float in the range [-1,1] """ if samples.dtype == np.float32: return samples return samples.astype(np.float32) / 32768 def audio_to_int16(samples): """ Convert audio samples to 16-bit int in the range [-32768,32767] """ if samples.dtype == np.int16: return samples elif samples.dtype == np.float32: return (samples * 32768).astype(np.int16) else: return samples.astype(np.int16) def AudioInput(wav=None, mic=None, sample_rate=16000, chunk_size=16000): """ Create an audio input stream from wav file or microphone. Either the wav or mic argument needs to be specified.
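For example (hypothetical device index; see list_audio_devices() for the indices on your system):

    stream = AudioInput(mic=11, sample_rate=16000)
    for samples in stream:
        process(samples)    # 'process' is a placeholder for your own handler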
Parameters: wav (string) -- path to .wav file mic (int) -- microphone device index sample_rate (int) -- the desired sample rate in Hz chunk_size (int) -- the number of samples returned per next() iteration Returns AudioWavStream or AudioMicStream """ if mic is not None and mic != '': return AudioMicStream(mic, sample_rate=sample_rate, chunk_size=chunk_size) elif wav is not None and wav != '': return AudioWavStream(wav, sample_rate=sample_rate, chunk_size=chunk_size) else: raise ValueError('either wav or mic argument must be specified') class AudioWavStream: """ Audio playback stream from .wav file """ def __init__(self, filename, sample_rate, chunk_size): self.filename = filename self.chunk_size = chunk_size self.sample_rate = sample_rate if not os.path.isfile(filename): raise IOError(f'could not find file {filename}') logging.info(f"loading audio '{filename}'") self.samples, _ = librosa.load(filename, sr=sample_rate, mono=True) self.position = 0 def open(self): pass def close(self): pass def reset(self): self.position = 0 def next(self): if self.position >= len(self.samples): return None chunk = self.samples[self.position : min(self.position + self.chunk_size, len(self.samples))] if len(chunk) < self.chunk_size: chunk = np.pad(chunk, (0, self.chunk_size-len(chunk)), mode='constant') self.position += self.chunk_size return chunk def __next__(self): samples = self.next() if samples is None: raise StopIteration else: return samples def __iter__(self): self.position = 0 return self class AudioMicStream: """ Live audio stream from microphone input device. """ def __init__(self, device, sample_rate, chunk_size): self.stream = None self.interface = pa.PyAudio() self.device_info = find_audio_device(device, self.interface) self.device_id = self.device_info['index'] self.device_sample_rate = sample_rate self.device_chunk_size = chunk_size self.sample_rate = sample_rate self.chunk_size = chunk_size print('Audio Input Device:') pprint.pprint(self.device_info) def __del__(self): self.close() self.interface.terminate() def open(self): if self.stream: return sample_rates = [self.sample_rate, int(self.device_info['defaultSampleRate']), 16000, 22050, 32000, 44100] chunk_sizes = [] for sample_rate in sample_rates: chunk_sizes.append(int(self.chunk_size * sample_rate / self.sample_rate)) for sample_rate, chunk_size in zip(sample_rates, chunk_sizes): try: logging.info(f'trying to open audio input {self.device_id} with sample_rate={sample_rate} chunk_size={chunk_size}') self.stream = self.interface.open(format=pa.paInt16, channels=1, rate=sample_rate, input=True, input_device_index=self.device_id, frames_per_buffer=chunk_size) self.device_sample_rate = sample_rate self.device_chunk_size = chunk_size break except OSError as err: print(err) logging.warning(f'failed to open audio input {self.device_id} with sample_rate={sample_rate}') self.stream = None if self.stream is None: logging.error(f'failed to open audio input device {self.device_id} with any of these sample rates:') logging.error(str(sample_rates)) raise ValueError(f"audio input device {self.device_id} couldn't be opened or does not support any of the above sample rates") print(f"\naudio stream opened on device {self.device_id} ({self.device_info['name']})") print("you can begin speaking now... 
(press Ctrl+C to exit)\n") def close(self): if self.stream is not None: self.stream.stop_stream() self.stream.close() self.stream = None def reset(self): self.close() self.open() def next(self): self.open() samples = self.stream.read(self.device_chunk_size, exception_on_overflow=False) samples = np.frombuffer(samples, dtype=np.int16) if self.sample_rate != self.device_sample_rate: samples = audio_to_float(samples) samples = librosa.resample(samples, self.device_sample_rate, self.sample_rate) if len(samples) != self.chunk_size: logging.warning(f'resampled input audio has {len(samples)} samples, but expected {self.chunk_size}') return samples def __next__(self): samples = self.next() if samples is None: raise StopIteration else: return samples def __iter__(self): self.open() return self class AudioOutput: """ Audio output stream to a speaker. """ def __init__(self, device, sample_rate, chunk_size=4096): self.stream = None if device is None: self.device_id = None logging.warning(f"creating pass-through audio output without a device") return self.interface = pa.PyAudio() self.device_info = find_audio_device(device, self.interface) self.device_id = self.device_info['index'] self.chunk_size = chunk_size self.sample_rate = sample_rate self.requested_rate = sample_rate print('Audio Output Device:') pprint.pprint(self.device_info) self.open() def __del__(self): if self.device_id is None: return self.close() self.interface.terminate() def open(self): if self.stream or self.device_id is None: return try: self.stream = self.interface.open(format=pa.paFloat32, channels=1, rate=self.sample_rate, frames_per_buffer=self.chunk_size, output=True, output_device_index=self.device_id) except: self.sample_rate = int(self.device_info['defaultSampleRate']) logging.error(f"failed to open audio output device with sample_rate={self.requested_rate}, trying again with sample_rate={self.sample_rate}") self.stream = self.interface.open(format=pa.paFloat32, channels=1, rate=self.sample_rate, frames_per_buffer=self.chunk_size, output=True, output_device_index=self.device_id) logging.info(f"opened audio output device {self.device_id} ({self.device_info['name']})") def close(self): if self.stream is not None: self.stream.stop_stream() self.stream.close() self.stream = None def write(self, samples): if self.device_id is None: return self.open() samples = audio_to_float(samples) if self.requested_rate != self.sample_rate: samples = librosa.resample(samples, self.requested_rate, self.sample_rate) #wav = soundfile.SoundFile('data/audio/resample_test.wav', mode='w', samplerate=self.sample_rate, channels=1) #wav.write(samples) #wav.close() self.stream.write(samples.tobytes()) # # device enumeration # _audio_device_info = None def _get_audio_devices(audio_interface=None): global _audio_device_info if _audio_device_info: return _audio_device_info if audio_interface: interface = audio_interface else: interface = pa.PyAudio() info = interface.get_host_api_info_by_index(0) numDevices = info.get('deviceCount') _audio_device_info = [] for i in range(0, numDevices): _audio_device_info.append(interface.get_device_info_by_host_api_device_index(0, i)) if not audio_interface: interface.terminate() return _audio_device_info def find_audio_device(device, audio_interface=None): """ Find an audio device by its name or ID number.
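For example (hypothetical device), find_audio_device(11) and find_audio_device('USB Microphone') both return the PyAudio device info dict for the matching device.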
""" devices = _get_audio_devices(audio_interface) try: device_id = int(device) except ValueError: if not isinstance(device, str): raise ValueError("expected either a string or an int for 'device' parameter") found = False for id, dev in enumerate(devices): if device.lower() == dev['name'].lower(): device_id = id found = True break if not found: raise ValueError(f"could not find audio device with name '{device}'") if device_id < 0 or device_id >= len(devices): raise ValueError(f"invalid audio device ID ({device_id})") return devices[device_id] def list_audio_inputs(): """ Print out information about present audio input devices. """ devices = _get_audio_devices() print('') print('----------------------------------------------------') print(f" Audio Input Devices") print('----------------------------------------------------') for i, dev_info in enumerate(devices): if (dev_info.get('maxInputChannels')) > 0: print("Input Device ID {:d} - '{:s}' (inputs={:.0f}) (sample_rate={:.0f})".format(i, dev_info.get('name'), dev_info.get('maxInputChannels'), dev_info.get('defaultSampleRate'))) print('') def list_audio_outputs(): """ Print out information about present audio output devices. """ devices = _get_audio_devices() print('') print('----------------------------------------------------') print(f" Audio Output Devices") print('----------------------------------------------------') for i, dev_info in enumerate(devices): if (dev_info.get('maxOutputChannels')) > 0: print("Output Device ID {:d} - '{:s}' (outputs={:.0f}) (sample_rate={:.0f})".format(i, dev_info.get('name'), dev_info.get('maxOutputChannels'), dev_info.get('defaultSampleRate'))) print('') def list_audio_devices(): """ Print out information about present audio input and output devices. """ list_audio_inputs() list_audio_outputs() ================================================ FILE: jetson_voice/utils/config.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import json import pprint import logging import argparse # # Default global configuration # # This can be overriden at runtime with command-line options (see ConfigArgParser) # such as --global-config to load your own configuration from json file, # or by calling config.load('my_config.json') # # You can also set the options directly on the 'config' object, e.g. # # config.model_dir = '/path/to/my/models' # config.log_level = 'warning' # # It's recommended to use one of the methods above instead of changing _default_config directly. # _default_global_config = { 'version' : 0.1, 'model_dir' : '/jetson-voice/data/networks', 'model_manifest' : '/jetson-voice/data/networks/manifest.json', 'default_backend' : 'tensorrt', 'log_level' : 'info', 'debug' : False, 'profile' : False } class ConfigDict(dict): """ Configuration dict that can be loaded from JSON and has members accessible via attributes and can watch for updates to keys. """ def __init__(self, *args, path=None, watch=None, **kwargs): """ Parameters: path (str) -- Path to JSON file to load from watch (function or dict) -- A callback function that gets called when a key is set. Should a function signature like my_watch(key, value) This can also be a dict of key names and functions, and each function will only be called when it's particular key has been set. You can also subclass ConfigDict and override the __watch__() member function. 
""" super(ConfigDict, self).__init__(*args, **kwargs) self.__dict__['path'] = path self.__dict__['watch'] = watch for x in args: if isinstance(x, dict): for y in x: self.__watch__(y, x[y]) for x in kwargs: self.__watch__(x, kwargs[x]) if path: self.load(path) def load(self, path, clear=False): """ Load from JSON file. """ from .resource import find_resource # import here to avoid circular dependency path = find_resource(path) self.__dict__['path'] = path if clear: self.clear() with open(path) as file: config_dict = json.load(file) self.update(config_dict) def __getattr__(self, attr): if attr in self.__dict__: return self.__dict__[attr] else: return self[attr] def __setattr__(self, attr, value): if attr in self.__dict__: self.__dict__[attr] = value else: self[attr] = value def __setitem__(self, key, value): if isinstance(value, dict): value = ConfigDict(value, watch=self.watch) value.__dict__['path'] = self.path super(ConfigDict, self).__setitem__(key, value) self.__watch__(key, value) def __watch__(self, key, value): #print(f'watch {key} -> {value}') if not self.watch: return if isinstance(self.watch, dict): if key in self.watch: self.watch[key](key, value) else: self.watch(key, value) def __str__(self): return pprint.pformat(self) #def __repr__(self): # return pprint.saferepr(self) def setdefault(self, key, default=None): if isinstance(default, dict): value = ConfigDict(value, watch=self.watch) value.__dict__['path'] = self.path changed = key not in self value = super(ConfigDict, self).setdefault(key, default) if changed: self.__watch__(key, value) def update(self, *args, **kwargs): for k, v in dict(*args, **kwargs).items(): self[k] = v # # logging handlers # logging.basicConfig(format='[%(asctime)s] %(filename)s:%(lineno)d - %(message)s', datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO) global_config = None def _set_log_level(key, value): log_value = value.upper() if log_value == 'VERBOSE': log_value = 'DEBUG' log_level = getattr(logging, log_value, None) if not isinstance(log_level, int): raise ValueError(f'Invalid log level: {value}') logging.getLogger().setLevel(log_level) logging.debug(f'set logging level to {value}') if global_config is not None and value.upper() == 'DEBUG': global_config['debug'] = True # # global config definition # global_config = ConfigDict(_default_global_config, watch={'log_level':_set_log_level}) if global_config.log_level.upper() == 'DEBUG': global_config['debug'] = True logging.debug(f'global config:\n{global_config}') # # custom arg parser # class ConfigArgParser(argparse.ArgumentParser): """ ArgumentParser that provides global configuration options. 
""" def __init__(self, *args, **kwargs): super(ConfigArgParser, self).__init__(*args, **kwargs) self.add_argument('--global-config', default=None, type=str, help='path to JSON file to load global configuration from') self.add_argument('--model-dir', default=_default_global_config['model_dir'], help=f"sets the root path of the models (default '{_default_global_config['model_dir']}')") self.add_argument('--model-manifest', default=_default_global_config['model_manifest'], help=f"sets the path to the model manifest file (default '{_default_global_config['model_manifest']}')") self.add_argument('--list-models', action='store_true', help='lists the available models (from $model_dir/manifest.json)') self.add_argument('--default-backend', default=_default_global_config['default_backend'], help=f"sets the default backend to use for model execution (default '{_default_global_config['default_backend']}')") self.add_argument('--profile', action='store_true', help='enables model performance profiling') self.add_argument('--verbose', action='store_true', help='sets the logging level to verbose') self.add_argument('--debug', action='store_true', help='sets the logging level to debug') log_levels = ['debug', 'verbose', 'info', 'warning', 'error', 'critical'] self.add_argument('--log-level', default=_default_global_config['log_level'], type=str, choices=log_levels, help=f"sets the logging level to one of the options above (default={_default_global_config['log_level']})") def parse_args(self, *args, **kwargs): args = super(ConfigArgParser, self).parse_args(*args, **kwargs) global_config.log_level = args.log_level global_config.model_dir = args.model_dir global_config.model_manifest = args.model_manifest global_config.default_backend = args.default_backend if args.profile: global_config.profile = True if args.verbose: global_config.log_level = 'verbose' if args.debug: global_config.log_level = 'debug' if args.global_config: global_config.load(args.global_config) if args.list_models: from .resource import list_models list_models() logging.debug(f'global config:\n{global_config}') return args ================================================ FILE: jetson_voice/utils/resource.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import json import time import tqdm import pprint import logging import tarfile import urllib import importlib from .config import global_config, ConfigDict def find_resource(path): """ Find a resource by checking some common paths. """ if os.path.exists(path): return path search_dirs = [global_config.model_dir, os.path.join(global_config.model_dir, 'asr'), os.path.join(global_config.model_dir, 'nlp'), os.path.join(global_config.model_dir, 'tts')] for search_dir in search_dirs: search_path = os.path.join(search_dir, path) if os.path.exists(search_path): return search_path raise IOError(f"failed to locate resource '{path}'") def load_resource(resource, factory_map, *args, **kwargs): """ Load an instance of a resource from a config or service name. The factory_map dict maps the backend names to class names. Returns the resource instance, or the config if factory_map is null. 
""" if isinstance(resource, str): root, ext = os.path.splitext(resource) if len(ext) > 0: ext = ext.lower() if ext == '.json': config = ConfigDict(path=resource) elif ext == '.onnx' or ext == '.engine' or ext == '.plan': config = ConfigDict(path=root + '.json') else: raise ValueError(f"resource '{resource}' has invalid extension '{ext}'") else: manifest = download_model(resource) if manifest['type'] == 'model': config = ConfigDict(path=get_model_config_path(manifest=manifest)) else: config = ConfigDict(backend=manifest['backend'], type=manifest['name']) elif isinstance(resource, ConfigDict): config = resource elif isinstance(resource, dict): config = ConfigDict(resource) else: raise ValueError(f"expected string or dict type, instead got {type(resource).__name__}") config.setdefault('backend', global_config.default_backend) if factory_map is None: return config if config.backend not in factory_map: raise ValueError(f"'{config.path}' has invalid backend '{config.backend}' (valid options are: {', '.join(factory_map.keys())})") class_name = factory_map[config.backend].rsplit(".", 1) class_type = getattr(importlib.import_module(class_name[0]), class_name[1]) logging.debug(f"creating instance of {factory_map[config.backend]} for '{config.path}' (backend {config.backend})") logging.debug(class_type) return class_type(config, *args, **kwargs) def load_model(config, dynamic_shapes=None): """ Loads an ONNX model through a backend (either TensorRT or onnxruntime) """ factory_map = { 'tensorrt' : 'jetson_voice.backends.tensorrt.TRTModel', 'onnxruntime' : 'jetson_voice.backends.onnxruntime.OnnxRuntimeModel' } config.setdefault('backend', global_config.default_backend) config.setdefault('model_path', os.path.splitext(config.path)[0] + '.onnx') if not os.path.exists(config.model_path): model_path = os.path.join(os.path.dirname(config.path), config.model_path) if not os.path.exists(model_path): raise IOError(f"couldn't find file '{config.model_path}'") else: config.model_path = model_path if config.backend not in factory_map: raise ValueError(f"'{config.path}' has invalid backend '{config.backend}' (valid options are: {', '.join(factory_map.keys())})") class_name = factory_map[config.backend].rsplit(".", 1) class_type = getattr(importlib.import_module(class_name[0]), class_name[1]) logging.info(f"loading model '{config.model_path}' with {factory_map[config.backend]}") logging.debug(class_type) return class_type(config, dynamic_shapes=dynamic_shapes) def load_models_manifest(path=None): """ Load the models manifest file. If the path isn't overriden, it will use the default 'data/networks/manifest.json' """ if path is None: path = global_config.model_manifest with open(path) as file: manifest = json.load(file) for key in manifest: manifest[key].setdefault('name', key) manifest[key].setdefault('config', key + '.json') manifest[key].setdefault('type', 'model') return manifest def find_model_manifest(name): """ Find a model manifest entry by name / alias. """ manifest = load_models_manifest() for key in manifest: if key.lower() == name.lower(): return manifest[key] if 'alias' in manifest[key]: if isinstance(manifest[key]['alias'], str): aliases = [manifest[key]['alias']] else: aliases = manifest[key]['alias'] for alias in aliases: if alias.lower() == name.lower(): return manifest[key] raise ValueError(f"could not find '{name}' in manifest '{global_config.model_manifest}'") def download_model(name, max_attempts=10, retry_time=5): """ Download a model if it hasn't already been downloaded. 
""" manifest = find_model_manifest(name) if manifest is None: return None if manifest['type'] != 'model': return manifest if os.path.exists(get_model_config_path(manifest=manifest)): return manifest class DownloadProgressBar(tqdm.tqdm): def update_to(self, b=1, bsize=1, tsize=None): if tsize is not None: self.total = tsize self.update(b * bsize - self.n) def attempt_download(attempt): logging.info(f"downloading '{manifest['name']}' from {manifest['url']} (attempt {attempt} of {max_attempts})") with DownloadProgressBar(unit='B', unit_scale=True, miniters=1, desc=manifest['name']) as t: try: filename, _ = urllib.request.urlretrieve(manifest['url'], reporthook=t.update_to) except Exception as error: t.close() logging.error(error) return None return filename for attempt in range(1, max_attempts+1): filename = attempt_download(attempt) if filename is not None: break logging.error(f"failed to download '{manifest['name']}' from {manifest['url']} (attempt {attempt} of {max_attempts})") if attempt == max_attempts: raise ValueError(f"failed to download '{manifest['name']}' from {manifest['url']} (max attempts exceeded)") logging.info(f"waiting {retry_time} seconds before trying again...") time.sleep(retry_time) logging.info(f"extracting {filename} to {os.path.join(global_config.model_dir, manifest['domain'], manifest['name'])}") with tarfile.open(filename, "r:gz") as tar: tar.list() tar.extractall(path=os.path.join(global_config.model_dir, manifest['domain'])) os.remove(filename) return manifest def get_model_config_path(name=None, manifest=None): """ Gets the path to the model config from it's name or manifest entry. """ if name is None and manifest is None: raise ValueError('must specify either name or manifest arguments') if manifest is None: manifest = find_model_manifest(name) if manifest['type'] != 'model': raise ValueError(f"resource '{manifest['name']}' is not a model (type='{manifest['type']}')") if len(os.path.dirname(manifest['config'])) > 0: # if full path is specified return os.path.join(global_config.model_dir, manifest['domain'], manifest['config']) else: return os.path.join(global_config.model_dir, manifest['domain'], manifest['name'], manifest['config']) def list_models(): """ Print out the models available. """ manifest = load_models_manifest() print('') print('----------------------------------------------------') print(f" Models") print('----------------------------------------------------') for key in list(manifest): if manifest[key]['type'] != 'model': manifest.pop(key) pprint.pprint(manifest) print('') ================================================ FILE: jetson_voice/utils/softmax.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import numpy as np def softmax(x, theta=1.0, axis=None): """ Compute the softmax of each element along an axis of x. Parameters ---------- x: ND-Array. Probably should be floats. theta (optional): float parameter, used as a multiplier prior to exponentiation. Default = 1.0 axis (optional): axis to compute values along. Default is the first non-singleton axis. Returns an array the same size as X. The result will sum to 1 along the specified axis. 
""" y = np.atleast_2d(x) # find axis if axis is None: axis = next(j[0] for j in enumerate(y.shape) if j[1] > 1) # multiply y against the theta parameter, y = y * float(theta) # subtract the max for numerical stability y = y - np.expand_dims(np.max(y, axis = axis), axis) # exponentiate y y = np.exp(y) # take the sum along the specified axis ax_sum = np.expand_dims(np.sum(y, axis = axis), axis) # finally: divide elementwise p = y / ax_sum # flatten if X was 1D if len(x.shape) == 1: p = p.flatten() return p def normalize_logits(logits): """ Normalize logits such that they are distributed between [0,1] """ return np.exp(logits - np.log(np.sum(np.exp(logits), axis=-1, keepdims=True))) ================================================ FILE: patches/nemo/1.0.0rc1/exportable.original.py ================================================ # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os from abc import ABC from collections import defaultdict from enum import Enum from typing import Dict import onnx import torch from nemo.core.classes import typecheck from nemo.core.neural_types import AxisKind, NeuralType from nemo.utils import logging from nemo.utils.export_utils import replace_for_export try: import onnx_graphsurgeon as gs ONNX_GRAPHSURGEON_AVAILABLE = True except (ImportError, ModuleNotFoundError): ONNX_GRAPHSURGEON_AVAILABLE = False __all__ = ['ExportFormat', 'Exportable'] class ExportFormat(Enum): """Which format to use when exporting a Neural Module for deployment""" ONNX = (1,) TORCHSCRIPT = (2,) _EXT_DICT = { ".pt": ExportFormat.TORCHSCRIPT, ".onnx": ExportFormat.ONNX, } class Exportable(ABC): """ This Interface should be implemented by particular classes derived from nemo.core.NeuralModule or nemo.core.ModelPT. It gives these entities ability to be exported for deployment to formats such as ONNX. 
""" @staticmethod def get_format(filename: str): _, ext = os.path.splitext(filename) try: return _EXT_DICT[ext] except KeyError: raise ValueError(f"Export file {filename} extension does not correspond to any export format!") @property def input_module(self): return self @property def output_module(self): return self def get_input_names(self, input_example): if isinstance(input_example, Dict): input_names = list(input_example.keys()) else: if not (hasattr(self, 'input_types')): raise NotImplementedError( 'For export to work you must define input_types or pass names in input_example' ) input_names = list(self.input_types.keys()) # remove unnecessary inputs for input_ports for name in self.disabled_deployment_input_names: input_names.remove(name) return input_names def get_output_names(self, output_example): if isinstance(output_example, Dict): output_names = list(output_example.keys()) else: if not (hasattr(self, 'output_types')): raise NotImplementedError( 'For export to work you must define output_types or pass names in output_example' ) output_names = list(self.output_types.keys()) # remove unnecessary inputs for input_ports for name in self.disabled_deployment_output_names: output_names.remove(name) return output_names def get_input_dynamic_axes(self, input_names): dynamic_axes = defaultdict(list) for name in input_names: dynamic_axes = { **dynamic_axes, **self._extract_dynamic_axes(name, self.input_types[name]), } return dynamic_axes def get_output_dynamic_axes(self, output_names): dynamic_axes = defaultdict(list) for name in output_names: dynamic_axes = { **dynamic_axes, **self._extract_dynamic_axes(name, self.output_types[name]), } return dynamic_axes def export( self, output: str, input_example=None, output_example=None, verbose=False, export_params=True, do_constant_folding=True, keep_initializers_as_inputs=False, onnx_opset_version: int = 12, try_script: bool = False, set_eval: bool = True, check_trace: bool = True, use_dynamic_axes: bool = True, dynamic_axes=None, check_tolerance=0.01, forward_method=None, ): qual_name = self.__module__ + '.' 
+ self.__class__.__qualname__ output_descr = qual_name + ' exported to ONNX' exported = ([output], [output_descr]) try: # Disable typechecks typecheck.set_typecheck_enabled(enabled=False) # Allow user to completely override forward method to export if forward_method is None and hasattr(type(self), "forward_for_export"): forward_method = type(self).forward_for_export if forward_method: old_forward_method = type(self).forward type(self).forward = forward_method # Set module to eval mode if set_eval: self.eval() format = self.get_format(output) self._prepare_for_export() with torch.jit.optimized_execution(True): jitted_model = None if try_script: try: jitted_model = torch.jit.script(self) except Exception as e: print("jit.script() failed!", e) if input_example is None: input_example = self.input_module.input_example() with torch.jit.optimized_execution(True): if format == ExportFormat.TORCHSCRIPT: if isinstance(input_example, Dict): input_example = tuple(input_example.values()) if jitted_model is None: jitted_model = torch.jit.trace( self, input_example, strict=False, optimize=True, check_trace=check_trace, check_tolerance=check_tolerance, ) jitted_model.save(output) assert os.path.exists(output) elif format == ExportFormat.ONNX: if jitted_model is None: jitted_model = self if output_example is None: if isinstance(input_example, tuple): output_example = self.forward(*input_example) else: output_example = self.forward(input_example) input_names = self.input_module.get_input_names(input_example) output_names = self.output_module.get_output_names(output_example) # dynamic axis is a mapping from input/output_name => list of "dynamic" indices if dynamic_axes is None and use_dynamic_axes: dynamic_axes = self.input_module.get_input_dynamic_axes(input_names) dynamic_axes = {**dynamic_axes, **self.output_module.get_output_dynamic_axes(output_names)} if isinstance(input_example, Dict): input_example = tuple(input_example.values()) torch.onnx.export( jitted_model, input_example, output, input_names=input_names, output_names=output_names, verbose=verbose, export_params=export_params, do_constant_folding=do_constant_folding, keep_initializers_as_inputs=keep_initializers_as_inputs, dynamic_axes=dynamic_axes, opset_version=onnx_opset_version, example_outputs=output_example, ) # Verify the model can be read, and is valid onnx_model = onnx.load(output) onnx.checker.check_model(onnx_model, full_check=True) if do_constant_folding: if not ONNX_GRAPHSURGEON_AVAILABLE: logging.info( f"onnx-graphsurgeon module is not instlled." "That may result in suboptimal optimization of exported ONNX graph (including unneeded DOUBLE initializers)." "Please follow the instructions available at:" "https://github.com/NVIDIA/TensorRT/tree/master/tools/onnx-graphsurgeon" "to install onnx-graphsurgeon from source to improve exported graph." ) else: # This pass is to remove/recast certain constants that are generated as 'double' # Those constants break ONNX -> TRT conversion (TRT does not support 'double' as of 7.2) # Can probably be removed once TRT has automatic downcast for double. # However, it may still be useful even then as it seems to always make the graph shorter. 
graph = gs.import_onnx(onnx_model) onnx_model = gs.export_onnx(graph.fold_constants().cleanup()) onnx.checker.check_model(onnx_model, full_check=True) onnx.save(onnx_model, output) else: raise ValueError(f'Encountered unknown export format {format}.') finally: typecheck.set_typecheck_enabled(enabled=True) if forward_method: type(self).forward = old_forward_method return exported @property def disabled_deployment_input_names(self): """Implement this method to return a set of input names disabled for export""" return set() @property def disabled_deployment_output_names(self): """Implement this method to return a set of output names disabled for export""" return set() @property def supported_export_formats(self): """Implement this method to return a set of export formats supported. Default is all types.""" return set([ExportFormat.ONNX, ExportFormat.TORCHSCRIPT]) @staticmethod def _extract_dynamic_axes(name: str, ntype: NeuralType): """ Implement this method to provide dynamic axes id for ONNX export. By default, this method will extract BATCH and TIME dimension ids from each provided input/output name argument. For example, if module/model accepts argument named "input_signal" with type corresponding to [Batch, Time, Dim] shape, then the returned result should contain "input_signal" -> [0, 1] because Batch and Time are dynamic axes as they can change from call to call during inference. Args: name: Name of input or output parameter ntype: Corresponding Neural Type Returns: """ dynamic_axes = defaultdict(list) if ntype.axes: for ind, axis in enumerate(ntype.axes): if axis.kind in [AxisKind.Batch, AxisKind.Time, AxisKind.Width, AxisKind.Height]: dynamic_axes[name].append(ind) return dynamic_axes def _prepare_for_export(self, replace_1D_2D=False): """ Override this method to prepare module for export. This is in-place operation. Base version does common necessary module replacements (Apex etc) """ replace_for_export(self, replace_1D_2D) ================================================ FILE: patches/nemo/1.0.0rc1/exportable.py ================================================ # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
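
# This patched copy differs from exportable.original.py above by logging the
# ONNX input/output names, example shapes, and dynamic axes immediately before
# torch.onnx.export() is called (it also carries a disabled snippet for
# trimming extra input examples), which helps diagnose exported graph bindings.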
import os from abc import ABC from collections import defaultdict from enum import Enum from typing import Dict import onnx import torch from nemo.core.classes import typecheck from nemo.core.neural_types import AxisKind, NeuralType from nemo.utils import logging from nemo.utils.export_utils import replace_for_export try: import onnx_graphsurgeon as gs ONNX_GRAPHSURGEON_AVAILABLE = True except (ImportError, ModuleNotFoundError): ONNX_GRAPHSURGEON_AVAILABLE = False __all__ = ['ExportFormat', 'Exportable'] class ExportFormat(Enum): """Which format to use when exporting a Neural Module for deployment""" ONNX = (1,) TORCHSCRIPT = (2,) _EXT_DICT = { ".pt": ExportFormat.TORCHSCRIPT, ".onnx": ExportFormat.ONNX, } class Exportable(ABC): """ This Interface should be implemented by particular classes derived from nemo.core.NeuralModule or nemo.core.ModelPT. It gives these entities ability to be exported for deployment to formats such as ONNX. """ @staticmethod def get_format(filename: str): _, ext = os.path.splitext(filename) try: return _EXT_DICT[ext] except KeyError: raise ValueError(f"Export file {filename} extension does not correspond to any export format!") @property def input_module(self): return self @property def output_module(self): return self def get_input_names(self, input_example): if isinstance(input_example, Dict): input_names = list(input_example.keys()) else: if not (hasattr(self, 'input_types')): raise NotImplementedError( 'For export to work you must define input_types or pass names in input_example' ) input_names = list(self.input_types.keys()) # remove unnecessary inputs for input_ports for name in self.disabled_deployment_input_names: input_names.remove(name) return input_names def get_output_names(self, output_example): if isinstance(output_example, Dict): output_names = list(output_example.keys()) else: if not (hasattr(self, 'output_types')): raise NotImplementedError( 'For export to work you must define output_types or pass names in output_example' ) output_names = list(self.output_types.keys()) # remove unnecessary inputs for input_ports for name in self.disabled_deployment_output_names: output_names.remove(name) return output_names def get_input_dynamic_axes(self, input_names): dynamic_axes = defaultdict(list) for name in input_names: dynamic_axes = { **dynamic_axes, **self._extract_dynamic_axes(name, self.input_types[name]), } return dynamic_axes def get_output_dynamic_axes(self, output_names): dynamic_axes = defaultdict(list) for name in output_names: dynamic_axes = { **dynamic_axes, **self._extract_dynamic_axes(name, self.output_types[name]), } return dynamic_axes def export( self, output: str, input_example=None, output_example=None, verbose=False, export_params=True, do_constant_folding=True, keep_initializers_as_inputs=False, onnx_opset_version: int = 12, try_script: bool = False, set_eval: bool = True, check_trace: bool = True, use_dynamic_axes: bool = True, dynamic_axes=None, check_tolerance=0.01, forward_method=None, ): qual_name = self.__module__ + '.' 
+ self.__class__.__qualname__ output_descr = qual_name + ' exported to ONNX' exported = ([output], [output_descr]) try: # Disable typechecks typecheck.set_typecheck_enabled(enabled=False) # Allow user to completely override forward method to export if forward_method is None and hasattr(type(self), "forward_for_export"): forward_method = type(self).forward_for_export if forward_method: old_forward_method = type(self).forward type(self).forward = forward_method # Set module to eval mode if set_eval: self.eval() format = self.get_format(output) self._prepare_for_export() with torch.jit.optimized_execution(True): jitted_model = None if try_script: try: jitted_model = torch.jit.script(self) except Exception as e: print("jit.script() failed!", e) if input_example is None: input_example = self.input_module.input_example() with torch.jit.optimized_execution(True): if format == ExportFormat.TORCHSCRIPT: if isinstance(input_example, Dict): input_example = tuple(input_example.values()) if jitted_model is None: jitted_model = torch.jit.trace( self, input_example, strict=False, optimize=True, check_trace=check_trace, check_tolerance=check_tolerance, ) jitted_model.save(output) assert os.path.exists(output) elif format == ExportFormat.ONNX: if jitted_model is None: jitted_model = self if output_example is None: if isinstance(input_example, tuple): output_example = self.forward(*input_example) else: output_example = self.forward(input_example) input_names = self.input_module.get_input_names(input_example) output_names = self.output_module.get_output_names(output_example) # dynamic axis is a mapping from input/output_name => list of "dynamic" indices if dynamic_axes is None and use_dynamic_axes: dynamic_axes = self.input_module.get_input_dynamic_axes(input_names) dynamic_axes = {**dynamic_axes, **self.output_module.get_output_dynamic_axes(output_names)} if isinstance(input_example, tuple): logging.info(f'ONNX input_example {len(input_example)}') for idx, x in enumerate(input_example): logging.info(f' - {idx} {x.shape}') """ if len(input_names) < len(input_example): logging.warning(f'removing extra input_examples to match number of input_names') input_example = tuple([input_example[x] for x in range(len(input_names))]) logging.warning(f'new number of input_examples: {len(input_example)}') """ logging.info(f'ONNX class_name {type(self).__name__}') logging.info(f'ONNX input_names {input_names}') logging.info(f'ONNX output_names {output_names}') logging.info(f'ONNX dynamic_axes {dynamic_axes}') if isinstance(input_example, Dict): input_example = tuple(input_example.values()) torch.onnx.export( jitted_model, input_example, output, input_names=input_names, output_names=output_names, verbose=verbose, export_params=export_params, do_constant_folding=do_constant_folding, keep_initializers_as_inputs=keep_initializers_as_inputs, dynamic_axes=dynamic_axes, opset_version=onnx_opset_version, example_outputs=output_example, ) # Verify the model can be read, and is valid onnx_model = onnx.load(output) onnx.checker.check_model(onnx_model, full_check=True) if do_constant_folding: if not ONNX_GRAPHSURGEON_AVAILABLE: logging.info( f"onnx-graphsurgeon module is not instlled." "That may result in suboptimal optimization of exported ONNX graph (including unneeded DOUBLE initializers)." "Please follow the instructions available at:" "https://github.com/NVIDIA/TensorRT/tree/master/tools/onnx-graphsurgeon" "to install onnx-graphsurgeon from source to improve exported graph." 
) else: # This pass is to remove/recast certain constants that are generated as 'double' # Those constants break ONNX -> TRT conversion (TRT does not support 'double' as of 7.2) # Can probably be removed once TRT has automatic downcast for double. # However, it may still be useful even then as it seems to always make the graph shorter. graph = gs.import_onnx(onnx_model) onnx_model = gs.export_onnx(graph.fold_constants().cleanup()) onnx.checker.check_model(onnx_model, full_check=True) onnx.save(onnx_model, output) else: raise ValueError(f'Encountered unknown export format {format}.') finally: typecheck.set_typecheck_enabled(enabled=True) if forward_method: type(self).forward = old_forward_method return exported @property def disabled_deployment_input_names(self): """Implement this method to return a set of input names disabled for export""" return set() @property def disabled_deployment_output_names(self): """Implement this method to return a set of output names disabled for export""" return set() @property def supported_export_formats(self): """Implement this method to return a set of export formats supported. Default is all types.""" return set([ExportFormat.ONNX, ExportFormat.TORCHSCRIPT]) @staticmethod def _extract_dynamic_axes(name: str, ntype: NeuralType): """ Implement this method to provide dynamic axes id for ONNX export. By default, this method will extract BATCH and TIME dimension ids from each provided input/output name argument. For example, if module/model accepts argument named "input_signal" with type corresponding to [Batch, Time, Dim] shape, then the returned result should contain "input_signal" -> [0, 1] because Batch and Time are dynamic axes as they can change from call to call during inference. Args: name: Name of input or output parameter ntype: Corresponding Neural Type Returns: """ dynamic_axes = defaultdict(list) if ntype.axes: for ind, axis in enumerate(ntype.axes): if axis.kind in [AxisKind.Batch, AxisKind.Time, AxisKind.Width, AxisKind.Height]: dynamic_axes[name].append(ind) return dynamic_axes def _prepare_for_export(self, replace_1D_2D=False): """ Override this method to prepare module for export. This is in-place operation. Base version does common necessary module replacements (Apex etc) """ replace_for_export(self, replace_1D_2D) ================================================ FILE: patches/nemo/1.0.0rc1/nlp/__init__.py ================================================ # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
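
# Patched __init__.py for nemo.collections.nlp.modules.common.huggingface:
# adds the MobileBertEncoder import so the mobilebert.py module from this
# patch set is exposed alongside the stock encoder wrappers.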
from nemo.collections.nlp.modules.common.huggingface.albert import AlbertEncoder from nemo.collections.nlp.modules.common.huggingface.bert import BertEncoder from nemo.collections.nlp.modules.common.huggingface.distilbert import DistilBertEncoder from nemo.collections.nlp.modules.common.huggingface.huggingface_utils import ( get_huggingface_lm_model, get_huggingface_pretrained_lm_models_list, ) from nemo.collections.nlp.modules.common.huggingface.roberta import RobertaEncoder from nemo.collections.nlp.modules.common.huggingface.mobilebert import MobileBertEncoder ================================================ FILE: patches/nemo/1.0.0rc1/nlp/distilbert.diff ================================================ 17a18 > from typing import Dict, Optional 19a21 > from nemo.core.neural_types import ChannelType, MaskType, NeuralType 29a32,53 > @property > def input_types(self) -> Optional[Dict[str, NeuralType]]: > """ > These are ordered incorrectly in bert_module.py WRT to QAModel.forward() > DistilBert doesn't use token_type_ids, but the QAModel still needs them during export. > By re-ordring them, the correct input_names are used during export of the ONNX model. > """ > return { > "input_ids": NeuralType(('B', 'T'), ChannelType()), > "token_type_ids": NeuralType(('B', 'T'), ChannelType(), optional=True), > "attention_mask": NeuralType(('B', 'T'), MaskType(), optional=True) > } > > ''' > # note: disabling the token_type_ids here still leads to incorrect names, because QAModel.forward() > # still needs the token_type_ids to run the trace, and hence the input_example is still larger > @property > def disabled_deployment_input_names(self): > """Implement this method to return a set of input names disabled for export""" > return ['token_type_ids'] > ''' > 34a59 > \ No newline at end of file ================================================ FILE: patches/nemo/1.0.0rc1/nlp/distilbert.original.py ================================================ # Copyright 2020 The Google AI Language Team Authors and # The HuggingFace Inc. team. # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from transformers import DistilBertModel from nemo.collections.nlp.modules.common.bert_module import BertModule from nemo.core.classes import typecheck __all__ = ['DistilBertEncoder'] class DistilBertEncoder(DistilBertModel, BertModule): """ Wraps around the Huggingface transformers implementation repository for easy use within NeMo. """ @typecheck() def forward(self, input_ids, attention_mask, token_type_ids=None): # distilBert does not use token_type_ids as the most of the other Bert models res = super().forward(input_ids=input_ids, attention_mask=attention_mask)[0] return res ================================================ FILE: patches/nemo/1.0.0rc1/nlp/distilbert.py ================================================ # Copyright 2020 The Google AI Language Team Authors and # The HuggingFace Inc. team. # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from transformers import DistilBertModel from typing import Dict, Optional from nemo.collections.nlp.modules.common.bert_module import BertModule from nemo.core.neural_types import ChannelType, MaskType, NeuralType from nemo.core.classes import typecheck __all__ = ['DistilBertEncoder'] class DistilBertEncoder(DistilBertModel, BertModule): """ Wraps around the Huggingface transformers implementation repository for easy use within NeMo. """ @property def input_types(self) -> Optional[Dict[str, NeuralType]]: """ These are ordered incorrectly in bert_module.py WRT to QAModel.forward() DistilBert doesn't use token_type_ids, but the QAModel still needs them during export. By re-ordring them, the correct input_names are used during export of the ONNX model. """ return { "input_ids": NeuralType(('B', 'T'), ChannelType()), "token_type_ids": NeuralType(('B', 'T'), ChannelType(), optional=True), "attention_mask": NeuralType(('B', 'T'), MaskType(), optional=True) } ''' # note: disabling the token_type_ids here still leads to incorrect names, because QAModel.forward() # still needs the token_type_ids to run the trace, and hence the input_example is still larger @property def disabled_deployment_input_names(self): """Implement this method to return a set of input names disabled for export""" return ['token_type_ids'] ''' @typecheck() def forward(self, input_ids, attention_mask, token_type_ids=None): # distilBert does not use token_type_ids as the most of the other Bert models res = super().forward(input_ids=input_ids, attention_mask=attention_mask)[0] return res ================================================ FILE: patches/nemo/1.0.0rc1/nlp/huggingface_utils.py ================================================ # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
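
# Patched huggingface_utils.py: registers MobileBERT (default checkpoint
# 'google/mobilebert-uncased') in HUGGINGFACE_MODELS next to the BERT,
# DistilBERT, RoBERTa, and ALBERT entries.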
import os from typing import List, Optional from transformers import ( ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST, ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, BERT_PRETRAINED_MODEL_ARCHIVE_LIST, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST, ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST, AlbertConfig, AutoModel, BertConfig, DistilBertConfig, RobertaConfig, MobileBertConfig, ) from nemo.collections.nlp.modules.common.huggingface.albert import AlbertEncoder from nemo.collections.nlp.modules.common.huggingface.bert import BertEncoder from nemo.collections.nlp.modules.common.huggingface.distilbert import DistilBertEncoder from nemo.collections.nlp.modules.common.huggingface.roberta import RobertaEncoder from nemo.collections.nlp.modules.common.huggingface.mobilebert import MobileBertEncoder from nemo.utils import logging __all__ = ["get_huggingface_lm_model", "get_huggingface_pretrained_lm_models_list"] HUGGINGFACE_MODELS = { "BertModel": { "default": "bert-base-uncased", "class": BertEncoder, "config": BertConfig, "pretrained_model_list": BERT_PRETRAINED_MODEL_ARCHIVE_LIST, }, "DistilBertModel": { "default": "distilbert-base-uncased", "class": DistilBertEncoder, "config": DistilBertConfig, "pretrained_model_list": DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST, }, "RobertaModel": { "default": "roberta-base", "class": RobertaEncoder, "config": RobertaConfig, "pretrained_model_list": ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, }, "AlbertModel": { "default": "albert-base-v2", "class": AlbertEncoder, "config": AlbertConfig, "pretrained_model_list": ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST, }, "MobileBertModel": { "default": "google/mobilebert-uncased", "class": MobileBertEncoder, "config": MobileBertConfig, "pretrained_model_list": MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST, }, } def get_huggingface_lm_model( pretrained_model_name: str, config_dict: Optional[dict] = None, config_file: Optional[str] = None, ): """ Returns lm model instantiated with Huggingface Args: pretrained_mode_name: specify this to instantiate pretrained model from Huggingface, e.g. bert-base-cased. For entire list, see get_huggingface_pretrained_lm_models_list(). config_dict: model configuration dictionary used to instantiate Huggingface model from scratch config_file: path to model configuration file used to instantiate Huggingface model from scratch Returns: BertModule """ try: automodel = AutoModel.from_pretrained(pretrained_model_name) except Exception as e: raise ValueError(f"{pretrained_model_name} is not supported by HuggingFace. {e}") model_type = type(automodel).__name__ if model_type in HUGGINGFACE_MODELS: model_class = HUGGINGFACE_MODELS[model_type]["class"] if config_file: if not os.path.exists(config_file): logging.warning( f"Config file was not found at {config_file}. Will attempt to use config_dict or pretrained_model_name." ) else: config_class = HUGGINGFACE_MODELS[model_type]["config"] return model_class(config_class.from_json_file(config_file)) if config_dict: config_class = HUGGINGFACE_MODELS[model_type]["config"] return model_class(config=config_class(**config_dict)) else: return model_class.from_pretrained(pretrained_model_name) else: raise ValueError(f"Use HuffingFace API directly in NeMo for {pretrained_model_name}") def get_huggingface_pretrained_lm_models_list(include_external: bool = False,) -> List[str]: """ Returns the list of pretrained HuggingFace language models Args: include_external if true includes all HuggingFace model names, not only those supported language models in NeMo. 
Returns the list of HuggingFace models """ huggingface_models = [] if include_external: huggingface_models = list(ALL_PRETRAINED_CONFIG_ARCHIVE_MAP.keys()) else: for model in HUGGINGFACE_MODELS: model_names = HUGGINGFACE_MODELS[model]["pretrained_model_list"] huggingface_models.extend(model_names) return huggingface_models ================================================ FILE: patches/nemo/1.0.0rc1/nlp/location.txt ================================================ nemo/collections/nlp/modules/common/huggingface Main branch. Commit 21a17b267fac68d4cdd20f3969a580a0a40dbdb4 ================================================ FILE: patches/nemo/1.0.0rc1/nlp/mobilebert.py ================================================ # Copyright 2018 The Google AI Language Team Authors and # The HuggingFace Inc. team. # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from transformers import MobileBertModel from nemo.collections.nlp.modules.common.bert_module import BertModule from nemo.core.classes import typecheck __all__ = ['MobileBertEncoder'] class MobileBertEncoder(MobileBertModel, BertModule): """ Wraps around the Huggingface transformers implementation repository for easy use within NeMo. """ @typecheck() def forward(self, input_ids, attention_mask, token_type_ids): res = super().forward(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)[0] return res ================================================ FILE: patches/nemo/1.0.0rc1/setup.original.py ================================================ # ! /usr/bin/python # -*- coding: utf-8 -*- # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
"""Setup for pip package.""" import codecs import os import subprocess import sys from distutils import cmd as distutils_cmd from distutils import log as distutils_log from itertools import chain import setuptools def is_build_action(): if len(sys.argv) <= 1: return False BUILD_TOKENS = ["egg_info", "dist", "bdist", "sdist", "install", "build", "develop", "style", "clean"] if any([sys.argv[1].startswith(x) for x in BUILD_TOKENS]): return True else: return False if is_build_action(): os.environ['NEMO_PACKAGE_BUILDING'] = 'True' from nemo.package_info import ( __contact_emails__, __contact_names__, __description__, __download_url__, __homepage__, __keywords__, __license__, __package_name__, __repository_url__, __version__, ) if os.path.exists('nemo/README.md'): with open("nemo/README.md", "r") as fh: long_description = fh.read() long_description_content_type = "text/markdown" elif os.path.exists('README.rst'): # codec is used for consistent encoding long_description = codecs.open( os.path.join(os.path.abspath(os.path.dirname(__file__)), 'README.rst'), 'r', 'utf-8', ).read() long_description_content_type = "text/x-rst" else: long_description = 'See ' + __homepage__ ############################################################################### # Dependency Loading # # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # def req_file(filename, folder="requirements"): with open(os.path.join(folder, filename)) as f: content = f.readlines() # you may also want to remove whitespace characters # Example: `\n` at the end of each line return [x.strip() for x in content] install_requires = req_file("requirements.txt") extras_require = { # User packages 'test': req_file("requirements_test.txt"), # Collections Packages 'asr': req_file("requirements_asr.txt"), 'cv': req_file("requirements_cv.txt"), 'nlp': req_file("requirements_nlp.txt"), 'tts': req_file("requirements_tts.txt"), } extras_require['all'] = list(chain(extras_require.values())) # TTS depends on ASR extras_require['tts'] = list(chain([extras_require['tts'], extras_require['asr']])) tests_requirements = extras_require["test"] ########################## VERSION MISMATCH PATCH ############################# # REMOVE AFTER 21.03 Container is released ! 
try: import torch version = torch.__version__ SUPPORTED_TORCH_VERSION = f"torch=={version}" if 'a' in version or 'b' in version: # It is githash release, force to supported Pytorch Lightning branch SUPPORTED_PYTORCH_LIGHTNING = "pytorch-lightning==1.1.5" else: # Downgrade torch, pytorch-lightning SUPPORTED_TORCH_VERSION = "torch<=1.7.1" SUPPORTED_PYTORCH_LIGHTNING = "pytorch-lightning==1.1.5" except (ImportError, ModuleNotFoundError): # Since no torch is installed, pip install torch will install latest torch and latest pytorch lightning SUPPORTED_TORCH_VERSION = "torch<=1.7.1" SUPPORTED_PYTORCH_LIGHTNING = "pytorch-lightning==1.1.5" install_requires_buffer = [] for ix, line in enumerate(install_requires): if 'lightning' in line: install_requires_buffer.append(SUPPORTED_PYTORCH_LIGHTNING) elif 'torch' in line: install_requires_buffer.append(SUPPORTED_TORCH_VERSION) # Pytorch 1.7.1 must use torchtext==0.8.0, torchaudio==0.7.2 and torchvision==0.8.2 if SUPPORTED_TORCH_VERSION == "torch<=1.7.1": install_requires_buffer.append("torchvision==0.8.2") install_requires_buffer.append("torchaudio==0.7.2") install_requires_buffer.append("torchtext==0.8.0") else: install_requires_buffer.append(line) # override install requires install_requires = install_requires_buffer ############################################################################### # Code style checkers # # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # class StyleCommand(distutils_cmd.Command): __LINE_WIDTH = 119 __ISORT_BASE = ( 'isort ' # These two lines makes isort compatible with black. '--multi-line=3 --trailing-comma --force-grid-wrap=0 ' f'--use-parentheses --line-width={__LINE_WIDTH} -rc -ws' ) __BLACK_BASE = f'black --skip-string-normalization --line-length={__LINE_WIDTH}' description = 'Checks overall project code style.' user_options = [ ('scope=', None, 'Folder of file to operate within.'), ('fix', None, 'True if tries to fix issues in-place.'), ] def __call_checker(self, base_command, scope, check): command = list(base_command) command.append(scope) if check: command.extend(['--check', '--diff']) self.announce( msg='Running command: %s' % str(' '.join(command)), level=distutils_log.INFO, ) return_code = subprocess.call(command) return return_code def _isort(self, scope, check): return self.__call_checker(base_command=self.__ISORT_BASE.split(), scope=scope, check=check,) def _black(self, scope, check): return self.__call_checker(base_command=self.__BLACK_BASE.split(), scope=scope, check=check,) def _pass(self): self.announce(msg='\033[32mPASS\x1b[0m', level=distutils_log.INFO) def _fail(self): self.announce(msg='\033[31mFAIL\x1b[0m', level=distutils_log.INFO) # noinspection PyAttributeOutsideInit def initialize_options(self): self.scope = '.' self.fix = '' def run(self): scope, check = self.scope, not self.fix isort_return = self._isort(scope=scope, check=check) black_return = self._black(scope=scope, check=check) if isort_return == 0 and black_return == 0: self._pass() else: self._fail() exit(isort_return if isort_return != 0 else black_return) def finalize_options(self): pass ############################################################################### setuptools.setup( name=__package_name__, # Versions should comply with PEP440. 
For a discussion on single-sourcing # the version across setup.py and the project code, see # https://packaging.python.org/en/latest/single_source_version.html version=__version__, description=__description__, long_description=long_description, long_description_content_type=long_description_content_type, # The project's main homepage. url=__repository_url__, download_url=__download_url__, # Author details author=__contact_names__, author_email=__contact_emails__, # maintainer Details maintainer=__contact_names__, maintainer_email=__contact_emails__, # The licence under which the project is released license=__license__, classifiers=[ # How mature is this project? Common values are # 1 - Planning # 2 - Pre-Alpha # 3 - Alpha # 4 - Beta # 5 - Production/Stable # 6 - Mature # 7 - Inactive 'Development Status :: 4 - Beta', # Indicate who your project is intended for 'Intended Audience :: Developers', 'Intended Audience :: Science/Research', 'Intended Audience :: Information Technology', # Indicate what your project relates to 'Topic :: Scientific/Engineering', 'Topic :: Scientific/Engineering :: Mathematics', 'Topic :: Scientific/Engineering :: Image Recognition', 'Topic :: Scientific/Engineering :: Artificial Intelligence', 'Topic :: Software Development :: Libraries', 'Topic :: Software Development :: Libraries :: Python Modules', 'Topic :: Utilities', # Pick your license as you wish (should match "license" above) 'License :: OSI Approved :: Apache Software License', # Supported python versions 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', # Additional Setting 'Environment :: Console', 'Natural Language :: English', 'Operating System :: OS Independent', ], packages=setuptools.find_packages(), install_requires=install_requires, setup_requires=['pytest-runner'], tests_require=tests_requirements, # List additional groups of dependencies here (e.g. development # dependencies). You can install these using the following syntax, # $ pip install -e ".[all]" # $ pip install nemo_toolkit[all] extras_require=extras_require, # Add in any packaged data. include_package_data=True, zip_safe=False, # PyPI package information. keywords=__keywords__, # Custom commands. cmdclass={'style': StyleCommand}, ) ================================================ FILE: patches/nemo/1.0.0rc1/setup.py ================================================ # ! /usr/bin/python # -*- coding: utf-8 -*- # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
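
# Patched setup.py: matches setup.original.py above except that the
# torchvision/torchaudio/torchtext version pins are dropped, since the
# container builds those packages from source at slightly different versions.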
"""Setup for pip package.""" import codecs import os import subprocess import sys from distutils import cmd as distutils_cmd from distutils import log as distutils_log from itertools import chain import setuptools def is_build_action(): if len(sys.argv) <= 1: return False BUILD_TOKENS = ["egg_info", "dist", "bdist", "sdist", "install", "build", "develop", "style", "clean"] if any([sys.argv[1].startswith(x) for x in BUILD_TOKENS]): return True else: return False if is_build_action(): os.environ['NEMO_PACKAGE_BUILDING'] = 'True' from nemo.package_info import ( __contact_emails__, __contact_names__, __description__, __download_url__, __homepage__, __keywords__, __license__, __package_name__, __repository_url__, __version__, ) if os.path.exists('nemo/README.md'): with open("nemo/README.md", "r") as fh: long_description = fh.read() long_description_content_type = "text/markdown" elif os.path.exists('README.rst'): # codec is used for consistent encoding long_description = codecs.open( os.path.join(os.path.abspath(os.path.dirname(__file__)), 'README.rst'), 'r', 'utf-8', ).read() long_description_content_type = "text/x-rst" else: long_description = 'See ' + __homepage__ ############################################################################### # Dependency Loading # # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # def req_file(filename, folder="requirements"): with open(os.path.join(folder, filename)) as f: content = f.readlines() # you may also want to remove whitespace characters # Example: `\n` at the end of each line return [x.strip() for x in content] install_requires = req_file("requirements.txt") extras_require = { # User packages 'test': req_file("requirements_test.txt"), # Collections Packages 'asr': req_file("requirements_asr.txt"), 'cv': req_file("requirements_cv.txt"), 'nlp': req_file("requirements_nlp.txt"), 'tts': req_file("requirements_tts.txt"), } extras_require['all'] = list(chain(extras_require.values())) # TTS depends on ASR extras_require['tts'] = list(chain([extras_require['tts'], extras_require['asr']])) tests_requirements = extras_require["test"] ########################## VERSION MISMATCH PATCH ############################# # REMOVE AFTER 21.03 Container is released ! 
try: import torch version = torch.__version__ SUPPORTED_TORCH_VERSION = f"torch=={version}" if 'a' in version or 'b' in version: # It is githash release, force to supported Pytorch Lightning branch SUPPORTED_PYTORCH_LIGHTNING = "pytorch-lightning==1.1.5" else: # Downgrade torch, pytorch-lightning SUPPORTED_TORCH_VERSION = "torch<=1.7.1" SUPPORTED_PYTORCH_LIGHTNING = "pytorch-lightning==1.1.5" except (ImportError, ModuleNotFoundError): # Since no torch is installed, pip install torch will install latest torch and latest pytorch lightning SUPPORTED_TORCH_VERSION = "torch<=1.7.1" SUPPORTED_PYTORCH_LIGHTNING = "pytorch-lightning==1.1.5" install_requires_buffer = [] for ix, line in enumerate(install_requires): if 'lightning' in line: install_requires_buffer.append(SUPPORTED_PYTORCH_LIGHTNING) elif 'torch' in line: install_requires_buffer.append(SUPPORTED_TORCH_VERSION) # Pytorch 1.7.1 must use torchtext==0.8.0, torchaudio==0.7.2 and torchvision==0.8.2 if SUPPORTED_TORCH_VERSION == "torch<=1.7.1": install_requires_buffer.append("torchvision") #"torchvision==0.8.2") # when we built from src in the container, it has a slightly different versions of these torch libraries install_requires_buffer.append("torchaudio") #"torchaudio==0.7.2") install_requires_buffer.append("torchtext") #"torchtext==0.8.0") else: install_requires_buffer.append(line) # override install requires install_requires = install_requires_buffer ############################################################################### # Code style checkers # # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # class StyleCommand(distutils_cmd.Command): __LINE_WIDTH = 119 __ISORT_BASE = ( 'isort ' # These two lines makes isort compatible with black. '--multi-line=3 --trailing-comma --force-grid-wrap=0 ' f'--use-parentheses --line-width={__LINE_WIDTH} -rc -ws' ) __BLACK_BASE = f'black --skip-string-normalization --line-length={__LINE_WIDTH}' description = 'Checks overall project code style.' user_options = [ ('scope=', None, 'Folder of file to operate within.'), ('fix', None, 'True if tries to fix issues in-place.'), ] def __call_checker(self, base_command, scope, check): command = list(base_command) command.append(scope) if check: command.extend(['--check', '--diff']) self.announce( msg='Running command: %s' % str(' '.join(command)), level=distutils_log.INFO, ) return_code = subprocess.call(command) return return_code def _isort(self, scope, check): return self.__call_checker(base_command=self.__ISORT_BASE.split(), scope=scope, check=check,) def _black(self, scope, check): return self.__call_checker(base_command=self.__BLACK_BASE.split(), scope=scope, check=check,) def _pass(self): self.announce(msg='\033[32mPASS\x1b[0m', level=distutils_log.INFO) def _fail(self): self.announce(msg='\033[31mFAIL\x1b[0m', level=distutils_log.INFO) # noinspection PyAttributeOutsideInit def initialize_options(self): self.scope = '.' self.fix = '' def run(self): scope, check = self.scope, not self.fix isort_return = self._isort(scope=scope, check=check) black_return = self._black(scope=scope, check=check) if isort_return == 0 and black_return == 0: self._pass() else: self._fail() exit(isort_return if isort_return != 0 else black_return) def finalize_options(self): pass ############################################################################### setuptools.setup( name=__package_name__, # Versions should comply with PEP440. 
For a discussion on single-sourcing # the version across setup.py and the project code, see # https://packaging.python.org/en/latest/single_source_version.html version=__version__, description=__description__, long_description=long_description, long_description_content_type=long_description_content_type, # The project's main homepage. url=__repository_url__, download_url=__download_url__, # Author details author=__contact_names__, author_email=__contact_emails__, # maintainer Details maintainer=__contact_names__, maintainer_email=__contact_emails__, # The licence under which the project is released license=__license__, classifiers=[ # How mature is this project? Common values are # 1 - Planning # 2 - Pre-Alpha # 3 - Alpha # 4 - Beta # 5 - Production/Stable # 6 - Mature # 7 - Inactive 'Development Status :: 4 - Beta', # Indicate who your project is intended for 'Intended Audience :: Developers', 'Intended Audience :: Science/Research', 'Intended Audience :: Information Technology', # Indicate what your project relates to 'Topic :: Scientific/Engineering', 'Topic :: Scientific/Engineering :: Mathematics', 'Topic :: Scientific/Engineering :: Image Recognition', 'Topic :: Scientific/Engineering :: Artificial Intelligence', 'Topic :: Software Development :: Libraries', 'Topic :: Software Development :: Libraries :: Python Modules', 'Topic :: Utilities', # Pick your license as you wish (should match "license" above) 'License :: OSI Approved :: Apache Software License', # Supported python versions 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', # Additional Setting 'Environment :: Console', 'Natural Language :: English', 'Operating System :: OS Independent', ], packages=setuptools.find_packages(), install_requires=install_requires, setup_requires=['pytest-runner'], tests_require=tests_requirements, # List additional groups of dependencies here (e.g. development # dependencies). You can install these using the following syntax, # $ pip install -e ".[all]" # $ pip install nemo_toolkit[all] extras_require=extras_require, # Add in any packaged data. include_package_data=True, zip_safe=False, # PyPI package information. keywords=__keywords__, # Custom commands. 
cmdclass={'style': StyleCommand}, ) ================================================ FILE: patches/nemo/1.6.2/requirements.original.txt ================================================ numpy>=1.21 onnx>=1.7.0 python-dateutil torch wrapt ruamel.yaml scikit-learn sentencepiece<1.0.0 tqdm>=4.41.0 numba wget frozendict unidecode ================================================ FILE: patches/nemo/1.6.2/requirements.txt ================================================ numpy onnx>=1.7.0 python-dateutil torch wrapt ruamel.yaml scikit-learn sentencepiece<1.0.0 tqdm>=4.41.0 numba wget frozendict unidecode ================================================ FILE: patches/nemo/1.6.2/requirements_nlp.original.txt ================================================ boto3 h5py matplotlib>=3.3.2 sentencepiece youtokentome>=1.0.5 numpy rapidfuzz gdown inflect sacrebleu[ja] sacremoses>=0.0.43 nltk>=3.6.5 fasttext opencc pangu jieba ftfy ================================================ FILE: patches/nemo/1.6.2/requirements_nlp.txt ================================================ boto3 h5py matplotlib sentencepiece youtokentome>=1.0.5 numpy gdown inflect sacremoses>=0.0.43 nltk>=3.6.5 fasttext opencc pangu jieba ftfy ================================================ FILE: patches/pytorch/1.6.0/functional.diff ================================================ 2a3,5 > import librosa # STFT patch for aarch64 > import numpy as np > 465c468,478 < return _VF.stft(input, n_fft, hop_length, win_length, window, normalized, onesided) --- > > # STFT patch for aarch64 > # https://stackoverflow.com/a/66872148 > librosa_stft = librosa.stft(input.cpu().detach().numpy().reshape(-1), n_fft, hop_length, win_length, window="hann", center=center, pad_mode=pad_mode) > librosa_stft = np.array([[a.real, a.imag] for a in librosa_stft]) > librosa_stft = np.transpose(librosa_stft, axes=[0, 2, 1]) > librosa_stft = np.expand_dims(librosa_stft, 0) > librosa_stft = torch.from_numpy(librosa_stft) > return librosa_stft > #return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore > # normalized, onesided, return_complex) ================================================ FILE: patches/pytorch/1.6.0/functional.original.py ================================================ from typing import Tuple, Optional import torch import torch.nn.functional as F from ._lowrank import svd_lowrank, pca_lowrank from ._overrides import has_torch_function, handle_torch_function from ._jit_internal import boolean_dispatch, List from ._jit_internal import _overload as overload Tensor = torch.Tensor from torch import _VF __all__ = [ 'align_tensors', 'broadcast_tensors', 'cartesian_prod', 'block_diag', 'cdist', 'chain_matmul', 'einsum', 'istft', 'lu', 'lu_unpack', 'norm', 'meshgrid', 'pca_lowrank', 'split', 'stft', 'svd_lowrank', 'tensordot', 'unique', 'unique_consecutive', ] def broadcast_tensors(*tensors): r"""broadcast_tensors(*tensors) -> List of Tensors Broadcasts the given tensors according to :ref:`broadcasting-semantics`. Args: *tensors: any number of tensors of the same type .. warning:: More than one element of a broadcasted tensor may refer to a single memory location. As a result, in-place operations (especially ones that are vectorized) may result in incorrect behavior. If you need to write to the tensors, please clone them first. 
Example:: >>> x = torch.arange(3).view(1, 3) >>> y = torch.arange(2).view(2, 1) >>> a, b = torch.broadcast_tensors(x, y) >>> a.size() torch.Size([2, 3]) >>> a tensor([[0, 1, 2], [0, 1, 2]]) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(broadcast_tensors, tensors, *tensors) return _VF.broadcast_tensors(tensors) def split(tensor, split_size_or_sections, dim=0): r"""Splits the tensor into chunks. Each chunk is a view of the original tensor. If :attr:`split_size_or_sections` is an integer type, then :attr:`tensor` will be split into equally sized chunks (if possible). Last chunk will be smaller if the tensor size along the given dimension :attr:`dim` is not divisible by :attr:`split_size`. If :attr:`split_size_or_sections` is a list, then :attr:`tensor` will be split into ``len(split_size_or_sections)`` chunks with sizes in :attr:`dim` according to :attr:`split_size_or_sections`. Arguments: tensor (Tensor): tensor to split. split_size_or_sections (int) or (list(int)): size of a single chunk or list of sizes for each chunk dim (int): dimension along which to split the tensor. Example:: >>> a = torch.arange(10).reshape(5,2) >>> a tensor([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]]) >>> torch.split(a, 2) (tensor([[0, 1], [2, 3]]), tensor([[4, 5], [6, 7]]), tensor([[8, 9]])) >>> torch.split(a, [1,4]) (tensor([[0, 1]]), tensor([[2, 3], [4, 5], [6, 7], [8, 9]])) """ if not torch.jit.is_scripting(): if type(tensor) is not Tensor and has_torch_function((tensor,)): return handle_torch_function(split, (tensor,), tensor, split_size_or_sections, dim=dim) # Overwriting reason: # This dispatches to two ATen functions depending on the type of # split_size_or_sections. The branching code is in tensor.py, which we # call here. return tensor.split(split_size_or_sections, dim) # equivalent to itertools.product(indices) def _indices_product(indices): # type: (List[int]) -> (List[List[int]]) empty_list = torch.jit.annotate(List[int], []) result = [empty_list] for idx in indices: result_temp = torch.jit.annotate(List[List[int]], []) for res in result: for i in range(idx): result_temp.append(res + [i]) result = result_temp return result def _index_tensor_with_indices_list(tensor, indices): # type: (Tensor, List[int]) -> Tensor out = tensor for index in indices: out = out[index] return out def lu_unpack(LU_data, LU_pivots, unpack_data=True, unpack_pivots=True): # type: (Tensor, Tensor, bool, bool) -> (Tuple[Optional[Tensor], Optional[Tensor], Optional[Tensor]]) r"""Unpacks the data and pivots from a LU factorization of a tensor. Returns a tuple of tensors as ``(the pivots, the L tensor, the U tensor)``. 
Arguments: LU_data (Tensor): the packed LU factorization data LU_pivots (Tensor): the packed LU factorization pivots unpack_data (bool): flag indicating if the data should be unpacked unpack_pivots (bool): flag indicating if the pivots should be unpacked Examples:: >>> A = torch.randn(2, 3, 3) >>> A_LU, pivots = A.lu() >>> P, A_L, A_U = torch.lu_unpack(A_LU, pivots) >>> >>> # can recover A from factorization >>> A_ = torch.bmm(P, torch.bmm(A_L, A_U)) >>> # LU factorization of a rectangular matrix: >>> A = torch.randn(2, 3, 2) >>> A_LU, pivots = A.lu() >>> P, A_L, A_U = torch.lu_unpack(A_LU, pivots) >>> P tensor([[[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]], [[0., 0., 1.], [0., 1., 0.], [1., 0., 0.]]]) >>> A_L tensor([[[ 1.0000, 0.0000], [ 0.4763, 1.0000], [ 0.3683, 0.1135]], [[ 1.0000, 0.0000], [ 0.2957, 1.0000], [-0.9668, -0.3335]]]) >>> A_U tensor([[[ 2.1962, 1.0881], [ 0.0000, -0.8681]], [[-1.0947, 0.3736], [ 0.0000, 0.5718]]]) >>> A_ = torch.bmm(P, torch.bmm(A_L, A_U)) >>> torch.norm(A_ - A) tensor(2.9802e-08) """ if not torch.jit.is_scripting(): tens_ops = (LU_data, LU_pivots) if any([type(t) is not Tensor for t in tens_ops]) and has_torch_function(tens_ops): return handle_torch_function( lu_unpack, tens_ops, LU_data, LU_pivots, unpack_data=unpack_data, unpack_pivots=unpack_pivots) shape = LU_data.shape # In generalized LU factorization, the following shape relations hold: # A.shape[-2:] == (m, n) # P.shape[-2:] == (m, m) # L.shape[-2:] == (m, k) # U.shape[-2:] == (k, n) # where k = min(m, n) m, n = shape[-2:] k = min(m, n) if unpack_data: U = LU_data.triu() if m != k: U = U.narrow(-2, 0, k) L = LU_data.tril() if k != n: L = L.narrow(-1, 0, k) L.diagonal(dim1=-2, dim2=-1).fill_(1) else: L = U = None if unpack_pivots: LU_pivots_zero_idx = LU_pivots - 1 if LU_data.dim() > 2: P = torch.eye(m, device=LU_data.device, dtype=LU_data.dtype) \ .expand(shape[:-1] + (m,)) \ .clone(memory_format=torch.contiguous_format) # TODO: rewrite when TorchScript supports product and map as # product(*map(lambda x: list(range(x)), shape[:-2])) when issue 33781 is fixed indices = _indices_product(shape[:-2]) for idx in indices: final_order = [i for i in range(m)] # noqa: C416 TODO: rewrite as list(range(m)) for k, j in enumerate(_index_tensor_with_indices_list(LU_pivots_zero_idx, idx)): final_order[k], final_order[j] = final_order[j], final_order[k] # TODO: remove _index_tensor_with_indices_list when TorchScript supports indexing Tensor with list p_idx = _index_tensor_with_indices_list(P, idx) p_idx.copy_(p_idx.index_select(1, torch.as_tensor(final_order, device=LU_pivots.device))) else: P = torch.eye(m, device=LU_data.device, dtype=LU_data.dtype) final_order = [i for i in range(m)] # noqa: C416 TODO: rewrite as list(range(m)) for k, j, in enumerate(LU_pivots_zero_idx): final_order[k], final_order[j] = final_order[j], final_order[k] P = P.index_select(1, torch.as_tensor(final_order, device=LU_pivots.device)) else: P = None return P, L, U def einsum(equation, *operands): r"""einsum(equation, *operands) -> Tensor This function provides a way of computing multilinear expressions (i.e. sums of products) using the Einstein summation convention. Args: equation (string): The equation is given in terms of lower case letters (indices) to be associated with each dimension of the operands and result. The left hand side lists the operands dimensions, separated by commas. There should be one index letter per tensor dimension. The right hand side follows after `->` and gives the indices for the output. 
If the `->` and right hand side are omitted, it implicitly defined as the alphabetically sorted list of all indices appearing exactly once in the left hand side. The indices not apprearing in the output are summed over after multiplying the operands entries. If an index appears several times for the same operand, a diagonal is taken. Ellipses `...` represent a fixed number of dimensions. If the right hand side is inferred, the ellipsis dimensions are at the beginning of the output. operands (Tensor): The operands to compute the Einstein sum of. .. note:: This function does not optimize the given expression, so a different formula for the same computation may run faster or consume less memory. Projects like opt_einsum (https://optimized-einsum.readthedocs.io/en/stable/) can optimize the formula for you. Examples:: >>> x = torch.randn(5) >>> y = torch.randn(4) >>> torch.einsum('i,j->ij', x, y) # outer product tensor([[-0.0570, -0.0286, -0.0231, 0.0197], [ 1.2616, 0.6335, 0.5113, -0.4351], [ 1.4452, 0.7257, 0.5857, -0.4984], [-0.4647, -0.2333, -0.1883, 0.1603], [-1.1130, -0.5588, -0.4510, 0.3838]]) >>> A = torch.randn(3,5,4) >>> l = torch.randn(2,5) >>> r = torch.randn(2,4) >>> torch.einsum('bn,anm,bm->ba', l, A, r) # compare torch.nn.functional.bilinear tensor([[-0.3430, -5.2405, 0.4494], [ 0.3311, 5.5201, -3.0356]]) >>> As = torch.randn(3,2,5) >>> Bs = torch.randn(3,5,4) >>> torch.einsum('bij,bjk->bik', As, Bs) # batch matrix multiplication tensor([[[-1.0564, -1.5904, 3.2023, 3.1271], [-1.6706, -0.8097, -0.8025, -2.1183]], [[ 4.2239, 0.3107, -0.5756, -0.2354], [-1.4558, -0.3460, 1.5087, -0.8530]], [[ 2.8153, 1.8787, -4.3839, -1.2112], [ 0.3728, -2.1131, 0.0921, 0.8305]]]) >>> A = torch.randn(3, 3) >>> torch.einsum('ii->i', A) # diagonal tensor([-0.7825, 0.8291, -0.1936]) >>> A = torch.randn(4, 3, 3) >>> torch.einsum('...ii->...i', A) # batch diagonal tensor([[-1.0864, 0.7292, 0.0569], [-0.9725, -1.0270, 0.6493], [ 0.5832, -1.1716, -1.5084], [ 0.4041, -1.1690, 0.8570]]) >>> A = torch.randn(2, 3, 4, 5) >>> torch.einsum('...ij->...ji', A).shape # batch permute torch.Size([2, 3, 5, 4]) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in operands) and has_torch_function(operands): return handle_torch_function(einsum, operands, equation, *operands) if len(operands) == 1 and isinstance(operands[0], (list, tuple)): # the old interface of passing the operands as one list argument operands = operands[0] # recurse incase operands contains value that has torch function # in the original implementation this line is omitted return einsum(equation, *operands) return _VF.einsum(equation, operands) def meshgrid(*tensors): r"""Take :math:`N` tensors, each of which can be either scalar or 1-dimensional vector, and create :math:`N` N-dimensional grids, where the :math:`i` :sup:`th` grid is defined by expanding the :math:`i` :sup:`th` input over dimensions defined by other inputs. Args: tensors (list of Tensor): list of scalars or 1 dimensional tensors. Scalars will be treated as tensors of size :math:`(1,)` automatically Returns: seq (sequence of Tensors): If the input has :math:`k` tensors of size :math:`(N_1,), (N_2,), \ldots , (N_k,)`, then the output would also have :math:`k` tensors, where all tensors are of size :math:`(N_1, N_2, \ldots , N_k)`. 
Example:: >>> x = torch.tensor([1, 2, 3]) >>> y = torch.tensor([4, 5, 6]) >>> grid_x, grid_y = torch.meshgrid(x, y) >>> grid_x tensor([[1, 1, 1], [2, 2, 2], [3, 3, 3]]) >>> grid_y tensor([[4, 5, 6], [4, 5, 6], [4, 5, 6]]) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(meshgrid, tensors, *tensors) if len(tensors) == 1 and isinstance(tensors[0], (list, tuple)): # the old interface of passing the operands as one list argument tensors = tensors[0] return _VF.meshgrid(tensors) def stft(input, n_fft, hop_length=None, win_length=None, window=None, center=True, pad_mode='reflect', normalized=False, onesided=True): # type: (Tensor, int, Optional[int], Optional[int], Optional[Tensor], bool, str, bool, bool) -> Tensor r"""Short-time Fourier transform (STFT). Ignoring the optional batch dimension, this method computes the following expression: .. math:: X[m, \omega] = \sum_{k = 0}^{\text{win\_length-1}}% \text{window}[k]\ \text{input}[m \times \text{hop\_length} + k]\ % \exp\left(- j \frac{2 \pi \cdot \omega k}{\text{win\_length}}\right), where :math:`m` is the index of the sliding window, and :math:`\omega` is the frequency that :math:`0 \leq \omega < \text{n\_fft}`. When :attr:`onesided` is the default value ``True``, * :attr:`input` must be either a 1-D time sequence or a 2-D batch of time sequences. * If :attr:`hop_length` is ``None`` (default), it is treated as equal to ``floor(n_fft / 4)``. * If :attr:`win_length` is ``None`` (default), it is treated as equal to :attr:`n_fft`. * :attr:`window` can be a 1-D tensor of size :attr:`win_length`, e.g., from :meth:`torch.hann_window`. If :attr:`window` is ``None`` (default), it is treated as if having :math:`1` everywhere in the window. If :math:`\text{win\_length} < \text{n\_fft}`, :attr:`window` will be padded on both sides to length :attr:`n_fft` before being applied. * If :attr:`center` is ``True`` (default), :attr:`input` will be padded on both sides so that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. Otherwise, the :math:`t`-th frame begins at time :math:`t \times \text{hop\_length}`. * :attr:`pad_mode` determines the padding method used on :attr:`input` when :attr:`center` is ``True``. See :meth:`torch.nn.functional.pad` for all available options. Default is ``"reflect"``. * If :attr:`onesided` is ``True`` (default), only values for :math:`\omega` in :math:`\left[0, 1, 2, \dots, \left\lfloor \frac{\text{n\_fft}}{2} \right\rfloor + 1\right]` are returned because the real-to-complex Fourier transform satisfies the conjugate symmetry, i.e., :math:`X[m, \omega] = X[m, \text{n\_fft} - \omega]^*`. * If :attr:`normalized` is ``True`` (default is ``False``), the function returns the normalized STFT results, i.e., multiplied by :math:`(\text{frame\_length})^{-0.5}`. Returns the real and the imaginary parts together as one tensor of size :math:`(* \times N \times T \times 2)`, where :math:`*` is the optional batch size of :attr:`input`, :math:`N` is the number of frequencies where STFT is applied, :math:`T` is the total number of frames used, and each pair in the last dimension represents a complex number as the real part and the imaginary part. .. warning:: This function changed signature at version 0.4.1. Calling with the previous signature may cause error or return incorrect result. 
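As a quick shape check (an illustrative example added editorially; it is not part of the upstream docstring, and the shapes assume the defaults ``center=True`` and ``onesided=True``)::

    >>> x = torch.randn(1, 16000)                       # one second of 16 kHz audio
    >>> torch.stft(x, n_fft=512, hop_length=160).shape  # (batch, n_fft//2 + 1, n_frames, 2)
    torch.Size([1, 257, 101, 2])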
Arguments: input (Tensor): the input tensor n_fft (int): size of Fourier transform hop_length (int, optional): the distance between neighboring sliding window frames. Default: ``None`` (treated as equal to ``floor(n_fft / 4)``) win_length (int, optional): the size of window frame and STFT filter. Default: ``None`` (treated as equal to :attr:`n_fft`) window (Tensor, optional): the optional window function. Default: ``None`` (treated as window of all :math:`1` s) center (bool, optional): whether to pad :attr:`input` on both sides so that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. Default: ``True`` pad_mode (string, optional): controls the padding method used when :attr:`center` is ``True``. Default: ``"reflect"`` normalized (bool, optional): controls whether to return the normalized STFT results Default: ``False`` onesided (bool, optional): controls whether to return half of results to avoid redundancy Default: ``True`` Returns: Tensor: A tensor containing the STFT result with shape described above """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( stft, (input,), input, n_fft, hop_length=hop_length, win_length=win_length, window=window, center=center, pad_mode=pad_mode, normalized=normalized, onesided=onesided) # TODO: after having proper ways to map Python strings to ATen Enum, move # this and F.pad to ATen. if center: signal_dim = input.dim() extended_shape = [1] * (3 - signal_dim) + list(input.size()) pad = int(n_fft // 2) input = F.pad(input.view(extended_shape), (pad, pad), pad_mode) input = input.view(input.shape[-signal_dim:]) return _VF.stft(input, n_fft, hop_length, win_length, window, normalized, onesided) def istft(input, n_fft, hop_length=None, win_length=None, window=None, center=True, normalized=False, onesided=True, length=None): # type: (Tensor, int, Optional[int], Optional[int], Optional[Tensor], bool, bool, bool, Optional[int]) -> Tensor r"""Inverse short time Fourier Transform. This is expected to be the inverse of :func:`~torch.stft`. It has the same parameters (+ additional optional parameter of :attr:`length`) and it should return the least squares estimation of the original signal. The algorithm will check using the NOLA condition ( nonzero overlap). Important consideration in the parameters :attr:`window` and :attr:`center` so that the envelop created by the summation of all the windows is never zero at certain point in time. Specifically, :math:`\sum_{t=-\infty}^{\infty} w^2[n-t\times hop\_length] \cancel{=} 0`. Since :func:`~torch.stft` discards elements at the end of the signal if they do not fit in a frame, ``istft`` may return a shorter signal than the original signal (can occur if :attr:`center` is False since the signal isn't padded). If :attr:`center` is ``True``, then there will be padding e.g. ``'constant'``, ``'reflect'``, etc. Left padding can be trimmed off exactly because they can be calculated but right padding cannot be calculated without additional information. Example: Suppose the last window is: ``[17, 18, 0, 0, 0]`` vs ``[18, 0, 0, 0, 0]`` The :attr:`n_fft`, :attr:`hop_length`, :attr:`win_length` are all the same which prevents the calculation of right padding. These additional values could be zeros or a reflection of the signal so providing :attr:`length` could be useful. If :attr:`length` is ``None`` then padding will be aggressively removed (some loss of signal). [1] D. W. Griffin and J. S. 
Lim, "Signal estimation from modified short-time Fourier transform," IEEE Trans. ASSP, vol.32, no.2, pp.236-243, Apr. 1984. Arguments: input (Tensor): The input tensor. Expected to be output of :func:`~torch.stft`, either 3D (``fft_size``, ``n_frame``, 2) or 4D (``channel``, ``fft_size``, ``n_frame``, 2). n_fft (int): Size of Fourier transform hop_length (Optional[int]): The distance between neighboring sliding window frames. (Default: ``n_fft // 4``) win_length (Optional[int]): The size of window frame and STFT filter. (Default: ``n_fft``) window (Optional[torch.Tensor]): The optional window function. (Default: ``torch.ones(win_length)``) center (bool): Whether :attr:`input` was padded on both sides so that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. (Default: ``True``) normalized (bool): Whether the STFT was normalized. (Default: ``False``) onesided (bool): Whether the STFT is onesided. (Default: ``True``) length (Optional[int]): The amount to trim the signal by (i.e. the original signal length). (Default: whole signal) Returns: Tensor: Least squares estimation of the original signal of size (..., signal_length) """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( istft, (input,), input, n_fft, hop_length=hop_length, win_length=win_length, window=window, center=center, normalized=normalized, onesided=onesided, length=length) return _VF.istft( input, n_fft, hop_length, win_length, window, center, normalized, onesided, length) del torch.unique_dim def _unique_impl(input, sorted=True, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor, Tensor] r"""Returns the unique elements of the input tensor. .. note:: This function is different from :func:`torch.unique_consecutive` in the sense that this function also eliminates non-consecutive duplicate values. .. note:: Currently in the CUDA implementation and the CPU implementation when dim is specified, `torch.unique` always sort the tensor at the beginning regardless of the `sort` argument. Sorting could be slow, so if your input tensor is already sorted, it is recommended to use :func:`torch.unique_consecutive` which avoids the sorting. Arguments: input (Tensor): the input tensor sorted (bool): Whether to sort the unique elements in ascending order before returning as output. return_inverse (bool): Whether to also return the indices for where elements in the original input ended up in the returned unique list. return_counts (bool): Whether to also return the counts for each unique element. dim (int): the dimension to apply unique. If ``None``, the unique of the flattened input is returned. default: ``None`` Returns: (Tensor, Tensor (optional), Tensor (optional)): A tensor or a tuple of tensors containing - **output** (*Tensor*): the output list of unique scalar elements. - **inverse_indices** (*Tensor*): (optional) if :attr:`return_inverse` is True, there will be an additional returned tensor (same shape as input) representing the indices for where elements in the original input map to in the output; otherwise, this function will only return a single tensor. - **counts** (*Tensor*): (optional) if :attr:`return_counts` is True, there will be an additional returned tensor (same shape as output or output.size(dim), if dim was specified) representing the number of occurrences for each unique value or tensor. 
Example:: >>> output = torch.unique(torch.tensor([1, 3, 2, 3], dtype=torch.long)) >>> output tensor([ 2, 3, 1]) >>> output, inverse_indices = torch.unique( torch.tensor([1, 3, 2, 3], dtype=torch.long), sorted=True, return_inverse=True) >>> output tensor([ 1, 2, 3]) >>> inverse_indices tensor([ 0, 2, 1, 2]) >>> output, inverse_indices = torch.unique( torch.tensor([[1, 3], [2, 3]], dtype=torch.long), sorted=True, return_inverse=True) >>> output tensor([ 1, 2, 3]) >>> inverse_indices tensor([[ 0, 2], [ 1, 2]]) """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( unique, (input,), input, sorted=sorted, return_inverse=return_inverse, return_counts=return_counts, dim=dim) if dim is not None: output, inverse_indices, counts = _VF.unique_dim( input, dim, sorted=sorted, return_inverse=return_inverse, return_counts=return_counts, ) else: output, inverse_indices, counts = torch._unique2( input, sorted=sorted, return_inverse=return_inverse, return_counts=return_counts, ) return output, inverse_indices, counts def _unique_consecutive_impl(input, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor, Tensor] r"""Eliminates all but the first element from every consecutive group of equivalent elements. .. note:: This function is different from :func:`torch.unique` in the sense that this function only eliminates consecutive duplicate values. This semantics is similar to `std::unique` in C++. Arguments: input (Tensor): the input tensor return_inverse (bool): Whether to also return the indices for where elements in the original input ended up in the returned unique list. return_counts (bool): Whether to also return the counts for each unique element. dim (int): the dimension to apply unique. If ``None``, the unique of the flattened input is returned. default: ``None`` Returns: (Tensor, Tensor (optional), Tensor (optional)): A tensor or a tuple of tensors containing - **output** (*Tensor*): the output list of unique scalar elements. - **inverse_indices** (*Tensor*): (optional) if :attr:`return_inverse` is True, there will be an additional returned tensor (same shape as input) representing the indices for where elements in the original input map to in the output; otherwise, this function will only return a single tensor. - **counts** (*Tensor*): (optional) if :attr:`return_counts` is True, there will be an additional returned tensor (same shape as output or output.size(dim), if dim was specified) representing the number of occurrences for each unique value or tensor. 
Example:: >>> x = torch.tensor([1, 1, 2, 2, 3, 1, 1, 2]) >>> output = torch.unique_consecutive(x) >>> output tensor([1, 2, 3, 1, 2]) >>> output, inverse_indices = torch.unique_consecutive(x, return_inverse=True) >>> output tensor([1, 2, 3, 1, 2]) >>> inverse_indices tensor([0, 0, 1, 1, 2, 3, 3, 4]) >>> output, counts = torch.unique_consecutive(x, return_counts=True) >>> output tensor([1, 2, 3, 1, 2]) >>> counts tensor([2, 2, 1, 2, 1]) """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( unique_consecutive, (input,), input, return_inverse=return_inverse, return_counts=return_counts, dim=dim) output, inverse_indices, counts = _VF.unique_consecutive( input, return_inverse=return_inverse, return_counts=return_counts, dim=dim) return output, inverse_indices, counts def _return_counts(input, sorted=True, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor] if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_impl(input, sorted, return_inverse, return_counts, dim) output, _, counts = _unique_impl(input, sorted, return_inverse, return_counts, dim) return output, counts def _return_output(input, sorted=True, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, bool, Optional[int]) -> Tensor if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_impl(input, sorted, return_inverse, return_counts, dim) output, _, _ = _unique_impl(input, sorted, return_inverse, return_counts, dim) return output def _return_inverse(input, sorted=True, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor] if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_impl(input, sorted, return_inverse, return_counts, dim) output, inverse_indices, _ = _unique_impl(input, sorted, return_inverse, return_counts, dim) return output, inverse_indices _return_inverse_false = boolean_dispatch( arg_name='return_counts', arg_index=3, default=False, if_true=_return_counts, if_false=_return_output, module_name=__name__, func_name='unique') _return_inverse_true = boolean_dispatch( arg_name='return_counts', arg_index=3, default=False, if_true=_unique_impl, if_false=_return_inverse, module_name=__name__, func_name='unique') # The return type of unique depends on `return_inverse`, and `return_counts` so in order to # resolve the output type in TorchScript we need to statically know the value of both parameters unique = boolean_dispatch( arg_name='return_inverse', arg_index=2, default=False, if_true=_return_inverse_true, if_false=_return_inverse_false, module_name=__name__, func_name='unique') unique.__doc__ = _unique_impl.__doc__ def _consecutive_return_counts(input, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor] if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_consecutive_impl(input, return_inverse, return_counts, dim) output, _, counts = _unique_consecutive_impl(input, return_inverse, return_counts, dim) return output, counts def _consecutive_return_output(input, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, Optional[int]) -> Tensor if not 
torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_consecutive_impl(input, return_inverse, return_counts, dim) output, _, _ = _unique_consecutive_impl(input, return_inverse, return_counts, dim) return output def _consecutive_return_inverse(input, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor] if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_consecutive_impl(input, return_inverse, return_counts, dim) output, inverse_indices, _ = _unique_consecutive_impl(input, return_inverse, return_counts, dim) return output, inverse_indices _consecutive_return_inverse_false = boolean_dispatch( arg_name='return_counts', arg_index=1, default=False, if_true=_consecutive_return_counts, if_false=_consecutive_return_output, module_name=__name__, func_name='unique_consecutive') _consecutive_return_inverse_true = boolean_dispatch( arg_name='return_counts', arg_index=1, default=False, if_true=_unique_consecutive_impl, if_false=_consecutive_return_inverse, module_name=__name__, func_name='unique_consecutive') # The return type of unique depends on `return_inverse`, and `return_counts` so in order to # resolve the output type in TorchScript we need to statically know the value of both parameters unique_consecutive = boolean_dispatch( arg_name='return_inverse', arg_index=2, default=False, if_true=_consecutive_return_inverse_true, if_false=_consecutive_return_inverse_false, module_name=__name__, func_name='unique_consecutive') unique_consecutive.__doc__ = _unique_consecutive_impl.__doc__ def tensordot(a, b, dims=2): r"""Returns a contraction of a and b over multiple dimensions. :attr:`tensordot` implements a generalized matrix product. Args: a (Tensor): Left tensor to contract b (Tensor): Right tensor to contract dims (int or tuple of two lists of integers): number of dimensions to contract or explicit lists of dimensions for :attr:`a` and :attr:`b` respectively When called with a non-negative integer argument :attr:`dims` = :math:`d`, and the number of dimensions of :attr:`a` and :attr:`b` is :math:`m` and :math:`n`, respectively, :func:`~torch.tensordot` computes .. math:: r_{i_0,...,i_{m-d}, i_d,...,i_n} = \sum_{k_0,...,k_{d-1}} a_{i_0,...,i_{m-d},k_0,...,k_{d-1}} \times b_{k_0,...,k_{d-1}, i_d,...,i_n}. When called with :attr:`dims` of the list form, the given dimensions will be contracted in place of the last :math:`d` of :attr:`a` and the first :math:`d` of :math:`b`. The sizes in these dimensions must match, but :func:`~torch.tensordot` will deal with broadcasted dimensions. 
Examples:: >>> a = torch.arange(60.).reshape(3, 4, 5) >>> b = torch.arange(24.).reshape(4, 3, 2) >>> torch.tensordot(a, b, dims=([1, 0], [0, 1])) tensor([[4400., 4730.], [4532., 4874.], [4664., 5018.], [4796., 5162.], [4928., 5306.]]) >>> a = torch.randn(3, 4, 5, device='cuda') >>> b = torch.randn(4, 5, 6, device='cuda') >>> c = torch.tensordot(a, b, dims=2).cpu() tensor([[ 8.3504, -2.5436, 6.2922, 2.7556, -1.0732, 3.2741], [ 3.3161, 0.0704, 5.0187, -0.4079, -4.3126, 4.8744], [ 0.8223, 3.9445, 3.2168, -0.2400, 3.4117, 1.7780]]) """ if not torch.jit.is_scripting(): if (type(a) is not Tensor or type(b) is not Tensor) and has_torch_function((a, b)): return handle_torch_function(tensordot, (a, b), a, b, dims=dims) if isinstance(dims, (list, tuple)) or \ (isinstance(dims, torch.Tensor) and dims.numel() > 1): dims_a, dims_b = dims else: if isinstance(dims, torch.Tensor): dims = dims.item() if dims < 0: raise RuntimeError("tensordot expects dims >= 0, but got dims={}".format(dims)) dims_a = list(range(-dims, 0)) dims_b = list(range(dims)) return _VF.tensordot(a, b, dims_a, dims_b) def cartesian_prod(*tensors): """Do cartesian product of the given sequence of tensors. The behavior is similar to python's `itertools.product`. Arguments: *tensors: any number of 1 dimensional tensors. Returns: Tensor: A tensor equivalent to converting all the input tensors into lists, do `itertools.product` on these lists, and finally convert the resulting list into tensor. Example:: >>> a = [1, 2, 3] >>> b = [4, 5] >>> list(itertools.product(a, b)) [(1, 4), (1, 5), (2, 4), (2, 5), (3, 4), (3, 5)] >>> tensor_a = torch.tensor(a) >>> tensor_b = torch.tensor(b) >>> torch.cartesian_prod(tensor_a, tensor_b) tensor([[1, 4], [1, 5], [2, 4], [2, 5], [3, 4], [3, 5]]) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(cartesian_prod, tensors, *tensors) return _VF.cartesian_prod(tensors) def block_diag(*tensors): """Create a block diagonal matrix from provided tensors. Arguments: *tensors: One or more tensors with 0, 1, or 2 dimensions. Returns: Tensor: A 2 dimensional tensor with all the input tensors arranged in order such that their upper left and lower right corners are diagonally adjacent. All other elements are set to 0. Example:: >>> import torch >>> A = torch.tensor([[0, 1], [1, 0]]) >>> B = torch.tensor([[3, 4, 5], [6, 7, 8]]) >>> C = torch.tensor(7) >>> D = torch.tensor([1, 2, 3]) >>> E = torch.tensor([[4], [5], [6]]) >>> torch.block_diag(A, B, C, D, E) tensor([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 3, 4, 5, 0, 0, 0, 0, 0], [0, 0, 6, 7, 8, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 7, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 2, 3, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 4], [0, 0, 0, 0, 0, 0, 0, 0, 0, 5], [0, 0, 0, 0, 0, 0, 0, 0, 0, 6]]) """ if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(block_diag, tensors, *tensors) return torch._C._VariableFunctions.block_diag(tensors) def cdist(x1, x2, p=2., compute_mode='use_mm_for_euclid_dist_if_necessary'): # type: (Tensor, Tensor, float, str) -> (Tensor) r"""Computes batched the p-norm distance between each pair of the two collections of row vectors. Args: x1 (Tensor): input tensor of shape :math:`B \times P \times M`. x2 (Tensor): input tensor of shape :math:`B \times R \times M`. p: p value for the p-norm distance to calculate between each vector pair :math:`\in [0, \infty]`. 
compute_mode: 'use_mm_for_euclid_dist_if_necessary' - will use matrix multiplication approach to calculate euclidean distance (p = 2) if P > 25 or R > 25 'use_mm_for_euclid_dist' - will always use matrix multiplication approach to calculate euclidean distance (p = 2) 'donot_use_mm_for_euclid_dist' - will never use matrix multiplication approach to calculate euclidean distance (p = 2) Default: use_mm_for_euclid_dist_if_necessary. If x1 has shape :math:`B \times P \times M` and x2 has shape :math:`B \times R \times M` then the output will have shape :math:`B \times P \times R`. This function is equivalent to `scipy.spatial.distance.cdist(input,'minkowski', p=p)` if :math:`p \in (0, \infty)`. When :math:`p = 0` it is equivalent to `scipy.spatial.distance.cdist(input, 'hamming') * M`. When :math:`p = \infty`, the closest scipy function is `scipy.spatial.distance.cdist(xn, lambda x, y: np.abs(x - y).max())`. Example: >>> a = torch.tensor([[0.9041, 0.0196], [-0.3108, -2.4423], [-0.4821, 1.059]]) >>> a tensor([[ 0.9041, 0.0196], [-0.3108, -2.4423], [-0.4821, 1.0590]]) >>> b = torch.tensor([[-2.1763, -0.4713], [-0.6986, 1.3702]]) >>> b tensor([[-2.1763, -0.4713], [-0.6986, 1.3702]]) >>> torch.cdist(a, b, p=2) tensor([[3.1193, 2.0959], [2.7138, 3.8322], [2.2830, 0.3791]]) """ if not torch.jit.is_scripting(): if (type(x1) is not Tensor or type(x2) is not Tensor) and has_torch_function((x1, x2)): return handle_torch_function( cdist, (x1, x2), x1, x2, p=p, compute_mode=compute_mode) if compute_mode == 'use_mm_for_euclid_dist_if_necessary': return _VF.cdist(x1, x2, p, None) elif compute_mode == 'use_mm_for_euclid_dist': return _VF.cdist(x1, x2, p, 1) elif compute_mode == 'donot_use_mm_for_euclid_dist': return _VF.cdist(x1, x2, p, 2) else: raise ValueError("{} is not a valid value for compute_mode".format(compute_mode)) # TODO: type dim as BroadcastingList when https://github.com/pytorch/pytorch/issues/33782 is fixed @overload # noqa: 749 def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 # type: (Tensor, str, Optional[List[int]], bool, Optional[Tensor], Optional[int]) -> Tensor pass @overload # noqa: 749 def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 # type: (Tensor, Optional[number], Optional[List[int]], bool, Optional[Tensor], Optional[int]) -> Tensor pass @overload # noqa: 749 def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 # type: (Tensor, Optional[number], Optional[int], bool, Optional[Tensor], Optional[int]) -> Tensor pass @overload # noqa: 749 def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 # type: (Tensor, str, Optional[int], bool, Optional[Tensor], Optional[int]) -> Tensor pass def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 r"""Returns the matrix norm or vector norm of a given tensor. Args: input (Tensor): the input tensor p (int, float, inf, -inf, 'fro', 'nuc', optional): the order of norm. 
Default: ``'fro'`` The following norms can be calculated: ===== ============================ ========================== ord matrix norm vector norm ===== ============================ ========================== None Frobenius norm 2-norm 'fro' Frobenius norm -- 'nuc' nuclear norm -- Other as vec norm when dim is None sum(abs(x)**ord)**(1./ord) ===== ============================ ========================== dim (int, 2-tuple of ints, 2-list of ints, optional): If it is an int, vector norm will be calculated, if it is 2-tuple of ints, matrix norm will be calculated. If the value is None, matrix norm will be calculated when the input tensor only has two dimensions, vector norm will be calculated when the input tensor only has one dimension. If the input tensor has more than two dimensions, the vector norm will be applied to last dimension. keepdim (bool, optional): whether the output tensors have :attr:`dim` retained or not. Ignored if :attr:`dim` = ``None`` and :attr:`out` = ``None``. Default: ``False`` out (Tensor, optional): the output tensor. Ignored if :attr:`dim` = ``None`` and :attr:`out` = ``None``. dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. If specified, the input tensor is casted to :attr:'dtype' while performing the operation. Default: None. Example:: >>> import torch >>> a = torch.arange(9, dtype= torch.float) - 4 >>> b = a.reshape((3, 3)) >>> torch.norm(a) tensor(7.7460) >>> torch.norm(b) tensor(7.7460) >>> torch.norm(a, float('inf')) tensor(4.) >>> torch.norm(b, float('inf')) tensor(4.) >>> c = torch.tensor([[ 1, 2, 3],[-1, 1, 4]] , dtype= torch.float) >>> torch.norm(c, dim=0) tensor([1.4142, 2.2361, 5.0000]) >>> torch.norm(c, dim=1) tensor([3.7417, 4.2426]) >>> torch.norm(c, p=1, dim=1) tensor([6., 6.]) >>> d = torch.arange(8, dtype= torch.float).reshape(2,2,2) >>> torch.norm(d, dim=(1,2)) tensor([ 3.7417, 11.2250]) >>> torch.norm(d[0, :, :]), torch.norm(d[1, :, :]) (tensor(3.7417), tensor(11.2250)) """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( norm, (input,), input, p=p, dim=dim, keepdim=keepdim, out=out, dtype=dtype) ndim = input.dim() # catch default case if dim is None and out is None and dtype is None and p is not None: if isinstance(p, str): if p == "fro": return _VF.frobenius_norm(input) if not isinstance(p, str): return _VF.norm(input, p) # TODO: when https://github.com/pytorch/pytorch/issues/33782 is fixed # remove the overloads where dim is an int and replace with BraodcastingList1 # and remove next four lines, replace _dim with dim if dim is not None: if isinstance(dim, int): _dim = [dim] else: _dim = dim else: _dim = None if isinstance(p, str): if p == "fro": if dtype is not None: raise ValueError("dtype argument is not supported in frobenius norm") if _dim is None: _dim = [i for i in range(ndim)] # noqa: C416 TODO: rewrite as list(range(m)) if out is None: return _VF.frobenius_norm(input, _dim, keepdim=keepdim) else: return _VF.frobenius_norm(input, _dim, keepdim=keepdim, out=out) elif p == "nuc": if dtype is not None: raise ValueError("dtype argument is not supported in nuclear norm") if _dim is None: if out is None: return _VF.nuclear_norm(input, keepdim=keepdim) else: return _VF.nuclear_norm(input, keepdim=keepdim, out=out) else: if out is None: return _VF.nuclear_norm(input, _dim, keepdim=keepdim) else: return _VF.nuclear_norm(input, _dim, keepdim=keepdim, out=out) raise RuntimeError("only valid string values are 'fro' and 'nuc', found 
{}".format(p)) else: if _dim is None: _dim = [i for i in range(ndim)] # noqa: C416 TODO: rewrite as list(range(m)) if out is None: if dtype is None: return _VF.norm(input, p, _dim, keepdim=keepdim) else: return _VF.norm(input, p, _dim, keepdim=keepdim, dtype=dtype) else: if dtype is None: return _VF.norm(input, p, _dim, keepdim=keepdim, out=out) else: return _VF.norm(input, p, _dim, keepdim=keepdim, dtype=dtype, out=out) def chain_matmul(*matrices): r"""Returns the matrix product of the :math:`N` 2-D tensors. This product is efficiently computed using the matrix chain order algorithm which selects the order in which incurs the lowest cost in terms of arithmetic operations (`[CLRS]`_). Note that since this is a function to compute the product, :math:`N` needs to be greater than or equal to 2; if equal to 2 then a trivial matrix-matrix product is returned. If :math:`N` is 1, then this is a no-op - the original matrix is returned as is. Args: matrices (Tensors...): a sequence of 2 or more 2-D tensors whose product is to be determined. Returns: Tensor: if the :math:`i^{th}` tensor was of dimensions :math:`p_{i} \times p_{i + 1}`, then the product would be of dimensions :math:`p_{1} \times p_{N + 1}`. Example:: >>> a = torch.randn(3, 4) >>> b = torch.randn(4, 5) >>> c = torch.randn(5, 6) >>> d = torch.randn(6, 7) >>> torch.chain_matmul(a, b, c, d) tensor([[ -2.3375, -3.9790, -4.1119, -6.6577, 9.5609, -11.5095, -3.2614], [ 21.4038, 3.3378, -8.4982, -5.2457, -10.2561, -2.4684, 2.7163], [ -0.9647, -5.8917, -2.3213, -5.2284, 12.8615, -12.2816, -2.5095]]) .. _`[CLRS]`: https://mitpress.mit.edu/books/introduction-algorithms-third-edition """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in matrices) and has_torch_function(matrices): return handle_torch_function(chain_matmul, matrices, *matrices) return _VF.chain_matmul(matrices) def _lu_impl(A, pivot=True, get_infos=False, out=None): # type: (Tensor, bool, bool, Any) -> Tuple[Tensor, Tensor, Tensor] r"""Computes the LU factorization of a matrix or batches of matrices :attr:`A`. Returns a tuple containing the LU factorization and pivots of :attr:`A`. Pivoting is done if :attr:`pivot` is set to ``True``. .. note:: The pivots returned by the function are 1-indexed. If :attr:`pivot` is ``False``, then the returned pivots is a tensor filled with zeros of the appropriate size. .. note:: LU factorization with :attr:`pivot` = ``False`` is not available for CPU, and attempting to do so will throw an error. However, LU factorization with :attr:`pivot` = ``False`` is available for CUDA. .. note:: This function does not check if the factorization was successful or not if :attr:`get_infos` is ``True`` since the status of the factorization is present in the third element of the return tuple. .. note:: In the case of batches of square matrices with size less or equal to 32 on a CUDA device, the LU factorization is repeated for singular matrices due to the bug in the MAGMA library (see magma issue 13). .. note:: ``L``, ``U``, and ``P`` can be derived using :func:`torch.lu_unpack`. Arguments: A (Tensor): the tensor to factor of size :math:`(*, m, n)` pivot (bool, optional): controls whether pivoting is done. Default: ``True`` get_infos (bool, optional): if set to ``True``, returns an info IntTensor. Default: ``False`` out (tuple, optional): optional output tuple. If :attr:`get_infos` is ``True``, then the elements in the tuple are Tensor, IntTensor, and IntTensor. 
If :attr:`get_infos` is ``False``, then the elements in the tuple are Tensor, IntTensor. Default: ``None`` Returns: (Tensor, IntTensor, IntTensor (optional)): A tuple of tensors containing - **factorization** (*Tensor*): the factorization of size :math:`(*, m, n)` - **pivots** (*IntTensor*): the pivots of size :math:`(*, m)` - **infos** (*IntTensor*, *optional*): if :attr:`get_infos` is ``True``, this is a tensor of size :math:`(*)` where non-zero values indicate whether factorization for the matrix or each minibatch has succeeded or failed Example:: >>> A = torch.randn(2, 3, 3) >>> A_LU, pivots = torch.lu(A) >>> A_LU tensor([[[ 1.3506, 2.5558, -0.0816], [ 0.1684, 1.1551, 0.1940], [ 0.1193, 0.6189, -0.5497]], [[ 0.4526, 1.2526, -0.3285], [-0.7988, 0.7175, -0.9701], [ 0.2634, -0.9255, -0.3459]]]) >>> pivots tensor([[ 3, 3, 3], [ 3, 3, 3]], dtype=torch.int32) >>> A_LU, pivots, info = torch.lu(A, get_infos=True) >>> if info.nonzero().size(0) == 0: ... print('LU factorization succeeded for all samples!') LU factorization succeeded for all samples! """ # If get_infos is True, then we don't need to check for errors and vice versa return torch._lu_with_info(A, pivot=pivot, check_errors=(not get_infos)) def _check_list_size(out_len, get_infos, out): # type: (int, bool, List[Tensor]) -> None get_infos_int = 1 if get_infos else 0 if out_len - get_infos_int != 2: raise TypeError("expected tuple of {} elements but got {}" .format(2 + int(get_infos), len(out_len))) if not isinstance(out, (tuple, list)): raise TypeError("argument 'out' must be tuple of Tensors, not {}" .format(type(out).__name__)) def _lu_with_infos(A, pivot=True, get_infos=False, out=None): # type: (Tensor, bool, bool, Optional[Tuple[Tensor, Tensor, Tensor]]) -> Tuple[Tensor, Tensor, Tensor] if not torch.jit.is_scripting(): if type(A) is not Tensor and has_torch_function((A,)): return handle_torch_function( lu, (A,), A, pivot=pivot, get_infos=get_infos, out=out) result = _lu_impl(A, pivot, get_infos, out) if out is not None: _check_list_size(len(out), get_infos, out) for i in range(len(out)): out[i].resize_as_(result[i]).copy_(result[i]) return out else: return result # A_LU, pivots, infos def _lu_no_infos(A, pivot=True, get_infos=False, out=None): # type: (Tensor, bool, bool, Optional[Tuple[Tensor, Tensor]]) -> Tuple[Tensor, Tensor] # need to check for torch_function here so that we exit if if not torch.jit.is_scripting(): if type(A) is not Tensor and has_torch_function((A,)): return handle_torch_function( lu, (A,), A, pivot=pivot, get_infos=get_infos, out=out) result = _lu_impl(A, pivot, get_infos, out) if out is not None: _check_list_size(len(out), get_infos, out) for i in range(len(out)): out[i].resize_as_(result[i]).copy_(result[i]) return out else: return result[0], result[1] # A_LU, pivots # The return type of lu depends on `get_infos`, so in order to resolve the output type # of lu in TorchScript we need to statically know the value of `get_infos` lu = boolean_dispatch( arg_name='get_infos', arg_index=2, default=False, if_true=_lu_with_infos, if_false=_lu_no_infos, module_name=__name__, func_name='lu') lu.__doc__ = _lu_impl.__doc__ def align_tensors(*tensors): raise RuntimeError('`align_tensors` not yet implemented.') ================================================ FILE: patches/pytorch/1.6.0/functional.py ================================================ from typing import Tuple, Optional import librosa # STFT patch for aarch64 import numpy as np import torch import torch.nn.functional as F from ._lowrank import svd_lowrank, 
pca_lowrank from ._overrides import has_torch_function, handle_torch_function from ._jit_internal import boolean_dispatch, List from ._jit_internal import _overload as overload Tensor = torch.Tensor from torch import _VF __all__ = [ 'align_tensors', 'broadcast_tensors', 'cartesian_prod', 'block_diag', 'cdist', 'chain_matmul', 'einsum', 'istft', 'lu', 'lu_unpack', 'norm', 'meshgrid', 'pca_lowrank', 'split', 'stft', 'svd_lowrank', 'tensordot', 'unique', 'unique_consecutive', ] def broadcast_tensors(*tensors): r"""broadcast_tensors(*tensors) -> List of Tensors Broadcasts the given tensors according to :ref:`broadcasting-semantics`. Args: *tensors: any number of tensors of the same type .. warning:: More than one element of a broadcasted tensor may refer to a single memory location. As a result, in-place operations (especially ones that are vectorized) may result in incorrect behavior. If you need to write to the tensors, please clone them first. Example:: >>> x = torch.arange(3).view(1, 3) >>> y = torch.arange(2).view(2, 1) >>> a, b = torch.broadcast_tensors(x, y) >>> a.size() torch.Size([2, 3]) >>> a tensor([[0, 1, 2], [0, 1, 2]]) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(broadcast_tensors, tensors, *tensors) return _VF.broadcast_tensors(tensors) def split(tensor, split_size_or_sections, dim=0): r"""Splits the tensor into chunks. Each chunk is a view of the original tensor. If :attr:`split_size_or_sections` is an integer type, then :attr:`tensor` will be split into equally sized chunks (if possible). Last chunk will be smaller if the tensor size along the given dimension :attr:`dim` is not divisible by :attr:`split_size`. If :attr:`split_size_or_sections` is a list, then :attr:`tensor` will be split into ``len(split_size_or_sections)`` chunks with sizes in :attr:`dim` according to :attr:`split_size_or_sections`. Arguments: tensor (Tensor): tensor to split. split_size_or_sections (int) or (list(int)): size of a single chunk or list of sizes for each chunk dim (int): dimension along which to split the tensor. Example:: >>> a = torch.arange(10).reshape(5,2) >>> a tensor([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]]) >>> torch.split(a, 2) (tensor([[0, 1], [2, 3]]), tensor([[4, 5], [6, 7]]), tensor([[8, 9]])) >>> torch.split(a, [1,4]) (tensor([[0, 1]]), tensor([[2, 3], [4, 5], [6, 7], [8, 9]])) """ if not torch.jit.is_scripting(): if type(tensor) is not Tensor and has_torch_function((tensor,)): return handle_torch_function(split, (tensor,), tensor, split_size_or_sections, dim=dim) # Overwriting reason: # This dispatches to two ATen functions depending on the type of # split_size_or_sections. The branching code is in tensor.py, which we # call here. 
return tensor.split(split_size_or_sections, dim) # equivalent to itertools.product(indices) def _indices_product(indices): # type: (List[int]) -> (List[List[int]]) empty_list = torch.jit.annotate(List[int], []) result = [empty_list] for idx in indices: result_temp = torch.jit.annotate(List[List[int]], []) for res in result: for i in range(idx): result_temp.append(res + [i]) result = result_temp return result def _index_tensor_with_indices_list(tensor, indices): # type: (Tensor, List[int]) -> Tensor out = tensor for index in indices: out = out[index] return out def lu_unpack(LU_data, LU_pivots, unpack_data=True, unpack_pivots=True): # type: (Tensor, Tensor, bool, bool) -> (Tuple[Optional[Tensor], Optional[Tensor], Optional[Tensor]]) r"""Unpacks the data and pivots from a LU factorization of a tensor. Returns a tuple of tensors as ``(the pivots, the L tensor, the U tensor)``. Arguments: LU_data (Tensor): the packed LU factorization data LU_pivots (Tensor): the packed LU factorization pivots unpack_data (bool): flag indicating if the data should be unpacked unpack_pivots (bool): flag indicating if the pivots should be unpacked Examples:: >>> A = torch.randn(2, 3, 3) >>> A_LU, pivots = A.lu() >>> P, A_L, A_U = torch.lu_unpack(A_LU, pivots) >>> >>> # can recover A from factorization >>> A_ = torch.bmm(P, torch.bmm(A_L, A_U)) >>> # LU factorization of a rectangular matrix: >>> A = torch.randn(2, 3, 2) >>> A_LU, pivots = A.lu() >>> P, A_L, A_U = torch.lu_unpack(A_LU, pivots) >>> P tensor([[[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]], [[0., 0., 1.], [0., 1., 0.], [1., 0., 0.]]]) >>> A_L tensor([[[ 1.0000, 0.0000], [ 0.4763, 1.0000], [ 0.3683, 0.1135]], [[ 1.0000, 0.0000], [ 0.2957, 1.0000], [-0.9668, -0.3335]]]) >>> A_U tensor([[[ 2.1962, 1.0881], [ 0.0000, -0.8681]], [[-1.0947, 0.3736], [ 0.0000, 0.5718]]]) >>> A_ = torch.bmm(P, torch.bmm(A_L, A_U)) >>> torch.norm(A_ - A) tensor(2.9802e-08) """ if not torch.jit.is_scripting(): tens_ops = (LU_data, LU_pivots) if any([type(t) is not Tensor for t in tens_ops]) and has_torch_function(tens_ops): return handle_torch_function( lu_unpack, tens_ops, LU_data, LU_pivots, unpack_data=unpack_data, unpack_pivots=unpack_pivots) shape = LU_data.shape # In generalized LU factorization, the following shape relations hold: # A.shape[-2:] == (m, n) # P.shape[-2:] == (m, m) # L.shape[-2:] == (m, k) # U.shape[-2:] == (k, n) # where k = min(m, n) m, n = shape[-2:] k = min(m, n) if unpack_data: U = LU_data.triu() if m != k: U = U.narrow(-2, 0, k) L = LU_data.tril() if k != n: L = L.narrow(-1, 0, k) L.diagonal(dim1=-2, dim2=-1).fill_(1) else: L = U = None if unpack_pivots: LU_pivots_zero_idx = LU_pivots - 1 if LU_data.dim() > 2: P = torch.eye(m, device=LU_data.device, dtype=LU_data.dtype) \ .expand(shape[:-1] + (m,)) \ .clone(memory_format=torch.contiguous_format) # TODO: rewrite when TorchScript supports product and map as # product(*map(lambda x: list(range(x)), shape[:-2])) when issue 33781 is fixed indices = _indices_product(shape[:-2]) for idx in indices: final_order = [i for i in range(m)] # noqa: C416 TODO: rewrite as list(range(m)) for k, j in enumerate(_index_tensor_with_indices_list(LU_pivots_zero_idx, idx)): final_order[k], final_order[j] = final_order[j], final_order[k] # TODO: remove _index_tensor_with_indices_list when TorchScript supports indexing Tensor with list p_idx = _index_tensor_with_indices_list(P, idx) p_idx.copy_(p_idx.index_select(1, torch.as_tensor(final_order, device=LU_pivots.device))) else: P = torch.eye(m, device=LU_data.device, 
dtype=LU_data.dtype) final_order = [i for i in range(m)] # noqa: C416 TODO: rewrite as list(range(m)) for k, j, in enumerate(LU_pivots_zero_idx): final_order[k], final_order[j] = final_order[j], final_order[k] P = P.index_select(1, torch.as_tensor(final_order, device=LU_pivots.device)) else: P = None return P, L, U def einsum(equation, *operands): r"""einsum(equation, *operands) -> Tensor This function provides a way of computing multilinear expressions (i.e. sums of products) using the Einstein summation convention. Args: equation (string): The equation is given in terms of lower case letters (indices) to be associated with each dimension of the operands and result. The left hand side lists the operands dimensions, separated by commas. There should be one index letter per tensor dimension. The right hand side follows after `->` and gives the indices for the output. If the `->` and right hand side are omitted, it implicitly defined as the alphabetically sorted list of all indices appearing exactly once in the left hand side. The indices not apprearing in the output are summed over after multiplying the operands entries. If an index appears several times for the same operand, a diagonal is taken. Ellipses `...` represent a fixed number of dimensions. If the right hand side is inferred, the ellipsis dimensions are at the beginning of the output. operands (Tensor): The operands to compute the Einstein sum of. .. note:: This function does not optimize the given expression, so a different formula for the same computation may run faster or consume less memory. Projects like opt_einsum (https://optimized-einsum.readthedocs.io/en/stable/) can optimize the formula for you. Examples:: >>> x = torch.randn(5) >>> y = torch.randn(4) >>> torch.einsum('i,j->ij', x, y) # outer product tensor([[-0.0570, -0.0286, -0.0231, 0.0197], [ 1.2616, 0.6335, 0.5113, -0.4351], [ 1.4452, 0.7257, 0.5857, -0.4984], [-0.4647, -0.2333, -0.1883, 0.1603], [-1.1130, -0.5588, -0.4510, 0.3838]]) >>> A = torch.randn(3,5,4) >>> l = torch.randn(2,5) >>> r = torch.randn(2,4) >>> torch.einsum('bn,anm,bm->ba', l, A, r) # compare torch.nn.functional.bilinear tensor([[-0.3430, -5.2405, 0.4494], [ 0.3311, 5.5201, -3.0356]]) >>> As = torch.randn(3,2,5) >>> Bs = torch.randn(3,5,4) >>> torch.einsum('bij,bjk->bik', As, Bs) # batch matrix multiplication tensor([[[-1.0564, -1.5904, 3.2023, 3.1271], [-1.6706, -0.8097, -0.8025, -2.1183]], [[ 4.2239, 0.3107, -0.5756, -0.2354], [-1.4558, -0.3460, 1.5087, -0.8530]], [[ 2.8153, 1.8787, -4.3839, -1.2112], [ 0.3728, -2.1131, 0.0921, 0.8305]]]) >>> A = torch.randn(3, 3) >>> torch.einsum('ii->i', A) # diagonal tensor([-0.7825, 0.8291, -0.1936]) >>> A = torch.randn(4, 3, 3) >>> torch.einsum('...ii->...i', A) # batch diagonal tensor([[-1.0864, 0.7292, 0.0569], [-0.9725, -1.0270, 0.6493], [ 0.5832, -1.1716, -1.5084], [ 0.4041, -1.1690, 0.8570]]) >>> A = torch.randn(2, 3, 4, 5) >>> torch.einsum('...ij->...ji', A).shape # batch permute torch.Size([2, 3, 5, 4]) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in operands) and has_torch_function(operands): return handle_torch_function(einsum, operands, equation, *operands) if len(operands) == 1 and isinstance(operands[0], (list, tuple)): # the old interface of passing the operands as one list argument operands = operands[0] # recurse incase operands contains value that has torch function # in the original implementation this line is omitted return einsum(equation, *operands) return _VF.einsum(equation, operands) def meshgrid(*tensors): 
r"""Take :math:`N` tensors, each of which can be either scalar or 1-dimensional vector, and create :math:`N` N-dimensional grids, where the :math:`i` :sup:`th` grid is defined by expanding the :math:`i` :sup:`th` input over dimensions defined by other inputs. Args: tensors (list of Tensor): list of scalars or 1 dimensional tensors. Scalars will be treated as tensors of size :math:`(1,)` automatically Returns: seq (sequence of Tensors): If the input has :math:`k` tensors of size :math:`(N_1,), (N_2,), \ldots , (N_k,)`, then the output would also have :math:`k` tensors, where all tensors are of size :math:`(N_1, N_2, \ldots , N_k)`. Example:: >>> x = torch.tensor([1, 2, 3]) >>> y = torch.tensor([4, 5, 6]) >>> grid_x, grid_y = torch.meshgrid(x, y) >>> grid_x tensor([[1, 1, 1], [2, 2, 2], [3, 3, 3]]) >>> grid_y tensor([[4, 5, 6], [4, 5, 6], [4, 5, 6]]) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(meshgrid, tensors, *tensors) if len(tensors) == 1 and isinstance(tensors[0], (list, tuple)): # the old interface of passing the operands as one list argument tensors = tensors[0] return _VF.meshgrid(tensors) def stft(input, n_fft, hop_length=None, win_length=None, window=None, center=True, pad_mode='reflect', normalized=False, onesided=True): # type: (Tensor, int, Optional[int], Optional[int], Optional[Tensor], bool, str, bool, bool) -> Tensor r"""Short-time Fourier transform (STFT). Ignoring the optional batch dimension, this method computes the following expression: .. math:: X[m, \omega] = \sum_{k = 0}^{\text{win\_length-1}}% \text{window}[k]\ \text{input}[m \times \text{hop\_length} + k]\ % \exp\left(- j \frac{2 \pi \cdot \omega k}{\text{win\_length}}\right), where :math:`m` is the index of the sliding window, and :math:`\omega` is the frequency that :math:`0 \leq \omega < \text{n\_fft}`. When :attr:`onesided` is the default value ``True``, * :attr:`input` must be either a 1-D time sequence or a 2-D batch of time sequences. * If :attr:`hop_length` is ``None`` (default), it is treated as equal to ``floor(n_fft / 4)``. * If :attr:`win_length` is ``None`` (default), it is treated as equal to :attr:`n_fft`. * :attr:`window` can be a 1-D tensor of size :attr:`win_length`, e.g., from :meth:`torch.hann_window`. If :attr:`window` is ``None`` (default), it is treated as if having :math:`1` everywhere in the window. If :math:`\text{win\_length} < \text{n\_fft}`, :attr:`window` will be padded on both sides to length :attr:`n_fft` before being applied. * If :attr:`center` is ``True`` (default), :attr:`input` will be padded on both sides so that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. Otherwise, the :math:`t`-th frame begins at time :math:`t \times \text{hop\_length}`. * :attr:`pad_mode` determines the padding method used on :attr:`input` when :attr:`center` is ``True``. See :meth:`torch.nn.functional.pad` for all available options. Default is ``"reflect"``. * If :attr:`onesided` is ``True`` (default), only values for :math:`\omega` in :math:`\left[0, 1, 2, \dots, \left\lfloor \frac{\text{n\_fft}}{2} \right\rfloor + 1\right]` are returned because the real-to-complex Fourier transform satisfies the conjugate symmetry, i.e., :math:`X[m, \omega] = X[m, \text{n\_fft} - \omega]^*`. * If :attr:`normalized` is ``True`` (default is ``False``), the function returns the normalized STFT results, i.e., multiplied by :math:`(\text{frame\_length})^{-0.5}`. 
Returns the real and the imaginary parts together as one tensor of size :math:`(* \times N \times T \times 2)`, where :math:`*` is the optional batch size of :attr:`input`, :math:`N` is the number of frequencies where STFT is applied, :math:`T` is the total number of frames used, and each pair in the last dimension represents a complex number as the real part and the imaginary part. .. warning:: This function changed signature at version 0.4.1. Calling with the previous signature may cause an error or return an incorrect result. Arguments: input (Tensor): the input tensor n_fft (int): size of Fourier transform hop_length (int, optional): the distance between neighboring sliding window frames. Default: ``None`` (treated as equal to ``floor(n_fft / 4)``) win_length (int, optional): the size of window frame and STFT filter. Default: ``None`` (treated as equal to :attr:`n_fft`) window (Tensor, optional): the optional window function. Default: ``None`` (treated as window of all :math:`1` s) center (bool, optional): whether to pad :attr:`input` on both sides so that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. Default: ``True`` pad_mode (string, optional): controls the padding method used when :attr:`center` is ``True``. Default: ``"reflect"`` normalized (bool, optional): controls whether to return the normalized STFT results Default: ``False`` onesided (bool, optional): controls whether to return half of results to avoid redundancy Default: ``True`` Returns: Tensor: A tensor containing the STFT result with shape described above """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( stft, (input,), input, n_fft, hop_length=hop_length, win_length=win_length, window=window, center=center, pad_mode=pad_mode, normalized=normalized, onesided=onesided) # TODO: after having proper ways to map Python strings to ATen Enum, move # this and F.pad to ATen. if center: signal_dim = input.dim() extended_shape = [1] * (3 - signal_dim) + list(input.size()) pad = int(n_fft // 2) input = F.pad(input.view(extended_shape), (pad, pad), pad_mode) input = input.view(input.shape[-signal_dim:]) # STFT patch for aarch64 # https://stackoverflow.com/a/66872148 librosa_stft = librosa.stft(input.cpu().detach().numpy().reshape(-1), n_fft, hop_length, win_length, window="hann", center=center, pad_mode=pad_mode) librosa_stft = np.array([[a.real, a.imag] for a in librosa_stft]) librosa_stft = np.transpose(librosa_stft, axes=[0, 2, 1]) librosa_stft = np.expand_dims(librosa_stft, 0) librosa_stft = torch.from_numpy(librosa_stft) return librosa_stft #return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore # normalized, onesided, return_complex) def istft(input, n_fft, hop_length=None, win_length=None, window=None, center=True, normalized=False, onesided=True, length=None): # type: (Tensor, int, Optional[int], Optional[int], Optional[Tensor], bool, bool, bool, Optional[int]) -> Tensor r"""Inverse short time Fourier Transform. This is expected to be the inverse of :func:`~torch.stft`. It has the same parameters (+ additional optional parameter of :attr:`length`) and it should return the least squares estimation of the original signal. The algorithm will check, using the NOLA condition (nonzero overlap), that the envelope created by the summation of all the windows is never zero at any point in time; the parameters :attr:`window` and :attr:`center` must be chosen so that this holds.
Specifically, :math:`\sum_{t=-\infty}^{\infty} w^2[n-t\times hop\_length] \cancel{=} 0`. Since :func:`~torch.stft` discards elements at the end of the signal if they do not fit in a frame, ``istft`` may return a shorter signal than the original signal (can occur if :attr:`center` is False since the signal isn't padded). If :attr:`center` is ``True``, then there will be padding e.g. ``'constant'``, ``'reflect'``, etc. Left padding can be trimmed off exactly because they can be calculated but right padding cannot be calculated without additional information. Example: Suppose the last window is: ``[17, 18, 0, 0, 0]`` vs ``[18, 0, 0, 0, 0]`` The :attr:`n_fft`, :attr:`hop_length`, :attr:`win_length` are all the same which prevents the calculation of right padding. These additional values could be zeros or a reflection of the signal so providing :attr:`length` could be useful. If :attr:`length` is ``None`` then padding will be aggressively removed (some loss of signal). [1] D. W. Griffin and J. S. Lim, "Signal estimation from modified short-time Fourier transform," IEEE Trans. ASSP, vol.32, no.2, pp.236-243, Apr. 1984. Arguments: input (Tensor): The input tensor. Expected to be output of :func:`~torch.stft`, either 3D (``fft_size``, ``n_frame``, 2) or 4D (``channel``, ``fft_size``, ``n_frame``, 2). n_fft (int): Size of Fourier transform hop_length (Optional[int]): The distance between neighboring sliding window frames. (Default: ``n_fft // 4``) win_length (Optional[int]): The size of window frame and STFT filter. (Default: ``n_fft``) window (Optional[torch.Tensor]): The optional window function. (Default: ``torch.ones(win_length)``) center (bool): Whether :attr:`input` was padded on both sides so that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. (Default: ``True``) normalized (bool): Whether the STFT was normalized. (Default: ``False``) onesided (bool): Whether the STFT is onesided. (Default: ``True``) length (Optional[int]): The amount to trim the signal by (i.e. the original signal length). (Default: whole signal) Returns: Tensor: Least squares estimation of the original signal of size (..., signal_length) """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( istft, (input,), input, n_fft, hop_length=hop_length, win_length=win_length, window=window, center=center, normalized=normalized, onesided=onesided, length=length) return _VF.istft( input, n_fft, hop_length, win_length, window, center, normalized, onesided, length) del torch.unique_dim def _unique_impl(input, sorted=True, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor, Tensor] r"""Returns the unique elements of the input tensor. .. note:: This function is different from :func:`torch.unique_consecutive` in the sense that this function also eliminates non-consecutive duplicate values. .. note:: Currently in the CUDA implementation and the CPU implementation when dim is specified, `torch.unique` always sort the tensor at the beginning regardless of the `sort` argument. Sorting could be slow, so if your input tensor is already sorted, it is recommended to use :func:`torch.unique_consecutive` which avoids the sorting. Arguments: input (Tensor): the input tensor sorted (bool): Whether to sort the unique elements in ascending order before returning as output. 
return_inverse (bool): Whether to also return the indices for where elements in the original input ended up in the returned unique list. return_counts (bool): Whether to also return the counts for each unique element. dim (int): the dimension to apply unique. If ``None``, the unique of the flattened input is returned. default: ``None`` Returns: (Tensor, Tensor (optional), Tensor (optional)): A tensor or a tuple of tensors containing - **output** (*Tensor*): the output list of unique scalar elements. - **inverse_indices** (*Tensor*): (optional) if :attr:`return_inverse` is True, there will be an additional returned tensor (same shape as input) representing the indices for where elements in the original input map to in the output; otherwise, this function will only return a single tensor. - **counts** (*Tensor*): (optional) if :attr:`return_counts` is True, there will be an additional returned tensor (same shape as output or output.size(dim), if dim was specified) representing the number of occurrences for each unique value or tensor. Example:: >>> output = torch.unique(torch.tensor([1, 3, 2, 3], dtype=torch.long)) >>> output tensor([ 2, 3, 1]) >>> output, inverse_indices = torch.unique( torch.tensor([1, 3, 2, 3], dtype=torch.long), sorted=True, return_inverse=True) >>> output tensor([ 1, 2, 3]) >>> inverse_indices tensor([ 0, 2, 1, 2]) >>> output, inverse_indices = torch.unique( torch.tensor([[1, 3], [2, 3]], dtype=torch.long), sorted=True, return_inverse=True) >>> output tensor([ 1, 2, 3]) >>> inverse_indices tensor([[ 0, 2], [ 1, 2]]) """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( unique, (input,), input, sorted=sorted, return_inverse=return_inverse, return_counts=return_counts, dim=dim) if dim is not None: output, inverse_indices, counts = _VF.unique_dim( input, dim, sorted=sorted, return_inverse=return_inverse, return_counts=return_counts, ) else: output, inverse_indices, counts = torch._unique2( input, sorted=sorted, return_inverse=return_inverse, return_counts=return_counts, ) return output, inverse_indices, counts def _unique_consecutive_impl(input, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor, Tensor] r"""Eliminates all but the first element from every consecutive group of equivalent elements. .. note:: This function is different from :func:`torch.unique` in the sense that this function only eliminates consecutive duplicate values. This semantics is similar to `std::unique` in C++. Arguments: input (Tensor): the input tensor return_inverse (bool): Whether to also return the indices for where elements in the original input ended up in the returned unique list. return_counts (bool): Whether to also return the counts for each unique element. dim (int): the dimension to apply unique. If ``None``, the unique of the flattened input is returned. default: ``None`` Returns: (Tensor, Tensor (optional), Tensor (optional)): A tensor or a tuple of tensors containing - **output** (*Tensor*): the output list of unique scalar elements. - **inverse_indices** (*Tensor*): (optional) if :attr:`return_inverse` is True, there will be an additional returned tensor (same shape as input) representing the indices for where elements in the original input map to in the output; otherwise, this function will only return a single tensor. 
- **counts** (*Tensor*): (optional) if :attr:`return_counts` is True, there will be an additional returned tensor (same shape as output or output.size(dim), if dim was specified) representing the number of occurrences for each unique value or tensor. Example:: >>> x = torch.tensor([1, 1, 2, 2, 3, 1, 1, 2]) >>> output = torch.unique_consecutive(x) >>> output tensor([1, 2, 3, 1, 2]) >>> output, inverse_indices = torch.unique_consecutive(x, return_inverse=True) >>> output tensor([1, 2, 3, 1, 2]) >>> inverse_indices tensor([0, 0, 1, 1, 2, 3, 3, 4]) >>> output, counts = torch.unique_consecutive(x, return_counts=True) >>> output tensor([1, 2, 3, 1, 2]) >>> counts tensor([2, 2, 1, 2, 1]) """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( unique_consecutive, (input,), input, return_inverse=return_inverse, return_counts=return_counts, dim=dim) output, inverse_indices, counts = _VF.unique_consecutive( input, return_inverse=return_inverse, return_counts=return_counts, dim=dim) return output, inverse_indices, counts def _return_counts(input, sorted=True, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor] if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_impl(input, sorted, return_inverse, return_counts, dim) output, _, counts = _unique_impl(input, sorted, return_inverse, return_counts, dim) return output, counts def _return_output(input, sorted=True, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, bool, Optional[int]) -> Tensor if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_impl(input, sorted, return_inverse, return_counts, dim) output, _, _ = _unique_impl(input, sorted, return_inverse, return_counts, dim) return output def _return_inverse(input, sorted=True, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor] if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_impl(input, sorted, return_inverse, return_counts, dim) output, inverse_indices, _ = _unique_impl(input, sorted, return_inverse, return_counts, dim) return output, inverse_indices _return_inverse_false = boolean_dispatch( arg_name='return_counts', arg_index=3, default=False, if_true=_return_counts, if_false=_return_output, module_name=__name__, func_name='unique') _return_inverse_true = boolean_dispatch( arg_name='return_counts', arg_index=3, default=False, if_true=_unique_impl, if_false=_return_inverse, module_name=__name__, func_name='unique') # The return type of unique depends on `return_inverse`, and `return_counts` so in order to # resolve the output type in TorchScript we need to statically know the value of both parameters unique = boolean_dispatch( arg_name='return_inverse', arg_index=2, default=False, if_true=_return_inverse_true, if_false=_return_inverse_false, module_name=__name__, func_name='unique') unique.__doc__ = _unique_impl.__doc__ def _consecutive_return_counts(input, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor] if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_consecutive_impl(input, return_inverse, return_counts, dim) output, _, counts = 
_unique_consecutive_impl(input, return_inverse, return_counts, dim) return output, counts def _consecutive_return_output(input, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, Optional[int]) -> Tensor if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_consecutive_impl(input, return_inverse, return_counts, dim) output, _, _ = _unique_consecutive_impl(input, return_inverse, return_counts, dim) return output def _consecutive_return_inverse(input, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor] if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_consecutive_impl(input, return_inverse, return_counts, dim) output, inverse_indices, _ = _unique_consecutive_impl(input, return_inverse, return_counts, dim) return output, inverse_indices _consecutive_return_inverse_false = boolean_dispatch( arg_name='return_counts', arg_index=1, default=False, if_true=_consecutive_return_counts, if_false=_consecutive_return_output, module_name=__name__, func_name='unique_consecutive') _consecutive_return_inverse_true = boolean_dispatch( arg_name='return_counts', arg_index=1, default=False, if_true=_unique_consecutive_impl, if_false=_consecutive_return_inverse, module_name=__name__, func_name='unique_consecutive') # The return type of unique depends on `return_inverse`, and `return_counts` so in order to # resolve the output type in TorchScript we need to statically know the value of both parameters unique_consecutive = boolean_dispatch( arg_name='return_inverse', arg_index=2, default=False, if_true=_consecutive_return_inverse_true, if_false=_consecutive_return_inverse_false, module_name=__name__, func_name='unique_consecutive') unique_consecutive.__doc__ = _unique_consecutive_impl.__doc__ def tensordot(a, b, dims=2): r"""Returns a contraction of a and b over multiple dimensions. :attr:`tensordot` implements a generalized matrix product. Args: a (Tensor): Left tensor to contract b (Tensor): Right tensor to contract dims (int or tuple of two lists of integers): number of dimensions to contract or explicit lists of dimensions for :attr:`a` and :attr:`b` respectively When called with a non-negative integer argument :attr:`dims` = :math:`d`, and the number of dimensions of :attr:`a` and :attr:`b` is :math:`m` and :math:`n`, respectively, :func:`~torch.tensordot` computes .. math:: r_{i_0,...,i_{m-d}, i_d,...,i_n} = \sum_{k_0,...,k_{d-1}} a_{i_0,...,i_{m-d},k_0,...,k_{d-1}} \times b_{k_0,...,k_{d-1}, i_d,...,i_n}. When called with :attr:`dims` of the list form, the given dimensions will be contracted in place of the last :math:`d` of :attr:`a` and the first :math:`d` of :math:`b`. The sizes in these dimensions must match, but :func:`~torch.tensordot` will deal with broadcasted dimensions. 
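For instance, in the first example below, ``dims=([1, 0], [0, 1])`` contracts dimension 1 of :attr:`a` (size 4) with dimension 0 of :attr:`b` (size 4), and dimension 0 of :attr:`a` (size 3) with dimension 1 of :attr:`b` (size 3), so the remaining dimensions give a result of size :math:`5 \times 2`.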
Examples:: >>> a = torch.arange(60.).reshape(3, 4, 5) >>> b = torch.arange(24.).reshape(4, 3, 2) >>> torch.tensordot(a, b, dims=([1, 0], [0, 1])) tensor([[4400., 4730.], [4532., 4874.], [4664., 5018.], [4796., 5162.], [4928., 5306.]]) >>> a = torch.randn(3, 4, 5, device='cuda') >>> b = torch.randn(4, 5, 6, device='cuda') >>> c = torch.tensordot(a, b, dims=2).cpu() tensor([[ 8.3504, -2.5436, 6.2922, 2.7556, -1.0732, 3.2741], [ 3.3161, 0.0704, 5.0187, -0.4079, -4.3126, 4.8744], [ 0.8223, 3.9445, 3.2168, -0.2400, 3.4117, 1.7780]]) """ if not torch.jit.is_scripting(): if (type(a) is not Tensor or type(b) is not Tensor) and has_torch_function((a, b)): return handle_torch_function(tensordot, (a, b), a, b, dims=dims) if isinstance(dims, (list, tuple)) or \ (isinstance(dims, torch.Tensor) and dims.numel() > 1): dims_a, dims_b = dims else: if isinstance(dims, torch.Tensor): dims = dims.item() if dims < 0: raise RuntimeError("tensordot expects dims >= 0, but got dims={}".format(dims)) dims_a = list(range(-dims, 0)) dims_b = list(range(dims)) return _VF.tensordot(a, b, dims_a, dims_b) def cartesian_prod(*tensors): """Do cartesian product of the given sequence of tensors. The behavior is similar to python's `itertools.product`. Arguments: *tensors: any number of 1 dimensional tensors. Returns: Tensor: A tensor equivalent to converting all the input tensors into lists, do `itertools.product` on these lists, and finally convert the resulting list into tensor. Example:: >>> a = [1, 2, 3] >>> b = [4, 5] >>> list(itertools.product(a, b)) [(1, 4), (1, 5), (2, 4), (2, 5), (3, 4), (3, 5)] >>> tensor_a = torch.tensor(a) >>> tensor_b = torch.tensor(b) >>> torch.cartesian_prod(tensor_a, tensor_b) tensor([[1, 4], [1, 5], [2, 4], [2, 5], [3, 4], [3, 5]]) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(cartesian_prod, tensors, *tensors) return _VF.cartesian_prod(tensors) def block_diag(*tensors): """Create a block diagonal matrix from provided tensors. Arguments: *tensors: One or more tensors with 0, 1, or 2 dimensions. Returns: Tensor: A 2 dimensional tensor with all the input tensors arranged in order such that their upper left and lower right corners are diagonally adjacent. All other elements are set to 0. Example:: >>> import torch >>> A = torch.tensor([[0, 1], [1, 0]]) >>> B = torch.tensor([[3, 4, 5], [6, 7, 8]]) >>> C = torch.tensor(7) >>> D = torch.tensor([1, 2, 3]) >>> E = torch.tensor([[4], [5], [6]]) >>> torch.block_diag(A, B, C, D, E) tensor([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 3, 4, 5, 0, 0, 0, 0, 0], [0, 0, 6, 7, 8, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 7, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 2, 3, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 4], [0, 0, 0, 0, 0, 0, 0, 0, 0, 5], [0, 0, 0, 0, 0, 0, 0, 0, 0, 6]]) """ if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(block_diag, tensors, *tensors) return torch._C._VariableFunctions.block_diag(tensors) def cdist(x1, x2, p=2., compute_mode='use_mm_for_euclid_dist_if_necessary'): # type: (Tensor, Tensor, float, str) -> (Tensor) r"""Computes batched the p-norm distance between each pair of the two collections of row vectors. Args: x1 (Tensor): input tensor of shape :math:`B \times P \times M`. x2 (Tensor): input tensor of shape :math:`B \times R \times M`. p: p value for the p-norm distance to calculate between each vector pair :math:`\in [0, \infty]`. 
compute_mode: 'use_mm_for_euclid_dist_if_necessary' - will use matrix multiplication approach to calculate euclidean distance (p = 2) if P > 25 or R > 25 'use_mm_for_euclid_dist' - will always use matrix multiplication approach to calculate euclidean distance (p = 2) 'donot_use_mm_for_euclid_dist' - will never use matrix multiplication approach to calculate euclidean distance (p = 2) Default: use_mm_for_euclid_dist_if_necessary. If x1 has shape :math:`B \times P \times M` and x2 has shape :math:`B \times R \times M` then the output will have shape :math:`B \times P \times R`. This function is equivalent to `scipy.spatial.distance.cdist(input,'minkowski', p=p)` if :math:`p \in (0, \infty)`. When :math:`p = 0` it is equivalent to `scipy.spatial.distance.cdist(input, 'hamming') * M`. When :math:`p = \infty`, the closest scipy function is `scipy.spatial.distance.cdist(xn, lambda x, y: np.abs(x - y).max())`. Example: >>> a = torch.tensor([[0.9041, 0.0196], [-0.3108, -2.4423], [-0.4821, 1.059]]) >>> a tensor([[ 0.9041, 0.0196], [-0.3108, -2.4423], [-0.4821, 1.0590]]) >>> b = torch.tensor([[-2.1763, -0.4713], [-0.6986, 1.3702]]) >>> b tensor([[-2.1763, -0.4713], [-0.6986, 1.3702]]) >>> torch.cdist(a, b, p=2) tensor([[3.1193, 2.0959], [2.7138, 3.8322], [2.2830, 0.3791]]) """ if not torch.jit.is_scripting(): if (type(x1) is not Tensor or type(x2) is not Tensor) and has_torch_function((x1, x2)): return handle_torch_function( cdist, (x1, x2), x1, x2, p=p, compute_mode=compute_mode) if compute_mode == 'use_mm_for_euclid_dist_if_necessary': return _VF.cdist(x1, x2, p, None) elif compute_mode == 'use_mm_for_euclid_dist': return _VF.cdist(x1, x2, p, 1) elif compute_mode == 'donot_use_mm_for_euclid_dist': return _VF.cdist(x1, x2, p, 2) else: raise ValueError("{} is not a valid value for compute_mode".format(compute_mode)) # TODO: type dim as BroadcastingList when https://github.com/pytorch/pytorch/issues/33782 is fixed @overload # noqa: 749 def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 # type: (Tensor, str, Optional[List[int]], bool, Optional[Tensor], Optional[int]) -> Tensor pass @overload # noqa: 749 def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 # type: (Tensor, Optional[number], Optional[List[int]], bool, Optional[Tensor], Optional[int]) -> Tensor pass @overload # noqa: 749 def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 # type: (Tensor, Optional[number], Optional[int], bool, Optional[Tensor], Optional[int]) -> Tensor pass @overload # noqa: 749 def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 # type: (Tensor, str, Optional[int], bool, Optional[Tensor], Optional[int]) -> Tensor pass def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 r"""Returns the matrix norm or vector norm of a given tensor. Args: input (Tensor): the input tensor p (int, float, inf, -inf, 'fro', 'nuc', optional): the order of norm. 
Default: ``'fro'`` The following norms can be calculated: ===== ============================ ========================== ord matrix norm vector norm ===== ============================ ========================== None Frobenius norm 2-norm 'fro' Frobenius norm -- 'nuc' nuclear norm -- Other as vec norm when dim is None sum(abs(x)**ord)**(1./ord) ===== ============================ ========================== dim (int, 2-tuple of ints, 2-list of ints, optional): If it is an int, vector norm will be calculated, if it is 2-tuple of ints, matrix norm will be calculated. If the value is None, matrix norm will be calculated when the input tensor only has two dimensions, vector norm will be calculated when the input tensor only has one dimension. If the input tensor has more than two dimensions, the vector norm will be applied to last dimension. keepdim (bool, optional): whether the output tensors have :attr:`dim` retained or not. Ignored if :attr:`dim` = ``None`` and :attr:`out` = ``None``. Default: ``False`` out (Tensor, optional): the output tensor. Ignored if :attr:`dim` = ``None`` and :attr:`out` = ``None``. dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. If specified, the input tensor is casted to :attr:'dtype' while performing the operation. Default: None. Example:: >>> import torch >>> a = torch.arange(9, dtype= torch.float) - 4 >>> b = a.reshape((3, 3)) >>> torch.norm(a) tensor(7.7460) >>> torch.norm(b) tensor(7.7460) >>> torch.norm(a, float('inf')) tensor(4.) >>> torch.norm(b, float('inf')) tensor(4.) >>> c = torch.tensor([[ 1, 2, 3],[-1, 1, 4]] , dtype= torch.float) >>> torch.norm(c, dim=0) tensor([1.4142, 2.2361, 5.0000]) >>> torch.norm(c, dim=1) tensor([3.7417, 4.2426]) >>> torch.norm(c, p=1, dim=1) tensor([6., 6.]) >>> d = torch.arange(8, dtype= torch.float).reshape(2,2,2) >>> torch.norm(d, dim=(1,2)) tensor([ 3.7417, 11.2250]) >>> torch.norm(d[0, :, :]), torch.norm(d[1, :, :]) (tensor(3.7417), tensor(11.2250)) """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( norm, (input,), input, p=p, dim=dim, keepdim=keepdim, out=out, dtype=dtype) ndim = input.dim() # catch default case if dim is None and out is None and dtype is None and p is not None: if isinstance(p, str): if p == "fro": return _VF.frobenius_norm(input) if not isinstance(p, str): return _VF.norm(input, p) # TODO: when https://github.com/pytorch/pytorch/issues/33782 is fixed # remove the overloads where dim is an int and replace with BraodcastingList1 # and remove next four lines, replace _dim with dim if dim is not None: if isinstance(dim, int): _dim = [dim] else: _dim = dim else: _dim = None if isinstance(p, str): if p == "fro": if dtype is not None: raise ValueError("dtype argument is not supported in frobenius norm") if _dim is None: _dim = [i for i in range(ndim)] # noqa: C416 TODO: rewrite as list(range(m)) if out is None: return _VF.frobenius_norm(input, _dim, keepdim=keepdim) else: return _VF.frobenius_norm(input, _dim, keepdim=keepdim, out=out) elif p == "nuc": if dtype is not None: raise ValueError("dtype argument is not supported in nuclear norm") if _dim is None: if out is None: return _VF.nuclear_norm(input, keepdim=keepdim) else: return _VF.nuclear_norm(input, keepdim=keepdim, out=out) else: if out is None: return _VF.nuclear_norm(input, _dim, keepdim=keepdim) else: return _VF.nuclear_norm(input, _dim, keepdim=keepdim, out=out) raise RuntimeError("only valid string values are 'fro' and 'nuc', found 
{}".format(p)) else: if _dim is None: _dim = [i for i in range(ndim)] # noqa: C416 TODO: rewrite as list(range(m)) if out is None: if dtype is None: return _VF.norm(input, p, _dim, keepdim=keepdim) else: return _VF.norm(input, p, _dim, keepdim=keepdim, dtype=dtype) else: if dtype is None: return _VF.norm(input, p, _dim, keepdim=keepdim, out=out) else: return _VF.norm(input, p, _dim, keepdim=keepdim, dtype=dtype, out=out) def chain_matmul(*matrices): r"""Returns the matrix product of the :math:`N` 2-D tensors. This product is efficiently computed using the matrix chain order algorithm which selects the order in which incurs the lowest cost in terms of arithmetic operations (`[CLRS]`_). Note that since this is a function to compute the product, :math:`N` needs to be greater than or equal to 2; if equal to 2 then a trivial matrix-matrix product is returned. If :math:`N` is 1, then this is a no-op - the original matrix is returned as is. Args: matrices (Tensors...): a sequence of 2 or more 2-D tensors whose product is to be determined. Returns: Tensor: if the :math:`i^{th}` tensor was of dimensions :math:`p_{i} \times p_{i + 1}`, then the product would be of dimensions :math:`p_{1} \times p_{N + 1}`. Example:: >>> a = torch.randn(3, 4) >>> b = torch.randn(4, 5) >>> c = torch.randn(5, 6) >>> d = torch.randn(6, 7) >>> torch.chain_matmul(a, b, c, d) tensor([[ -2.3375, -3.9790, -4.1119, -6.6577, 9.5609, -11.5095, -3.2614], [ 21.4038, 3.3378, -8.4982, -5.2457, -10.2561, -2.4684, 2.7163], [ -0.9647, -5.8917, -2.3213, -5.2284, 12.8615, -12.2816, -2.5095]]) .. _`[CLRS]`: https://mitpress.mit.edu/books/introduction-algorithms-third-edition """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in matrices) and has_torch_function(matrices): return handle_torch_function(chain_matmul, matrices, *matrices) return _VF.chain_matmul(matrices) def _lu_impl(A, pivot=True, get_infos=False, out=None): # type: (Tensor, bool, bool, Any) -> Tuple[Tensor, Tensor, Tensor] r"""Computes the LU factorization of a matrix or batches of matrices :attr:`A`. Returns a tuple containing the LU factorization and pivots of :attr:`A`. Pivoting is done if :attr:`pivot` is set to ``True``. .. note:: The pivots returned by the function are 1-indexed. If :attr:`pivot` is ``False``, then the returned pivots is a tensor filled with zeros of the appropriate size. .. note:: LU factorization with :attr:`pivot` = ``False`` is not available for CPU, and attempting to do so will throw an error. However, LU factorization with :attr:`pivot` = ``False`` is available for CUDA. .. note:: This function does not check if the factorization was successful or not if :attr:`get_infos` is ``True`` since the status of the factorization is present in the third element of the return tuple. .. note:: In the case of batches of square matrices with size less or equal to 32 on a CUDA device, the LU factorization is repeated for singular matrices due to the bug in the MAGMA library (see magma issue 13). .. note:: ``L``, ``U``, and ``P`` can be derived using :func:`torch.lu_unpack`. Arguments: A (Tensor): the tensor to factor of size :math:`(*, m, n)` pivot (bool, optional): controls whether pivoting is done. Default: ``True`` get_infos (bool, optional): if set to ``True``, returns an info IntTensor. Default: ``False`` out (tuple, optional): optional output tuple. If :attr:`get_infos` is ``True``, then the elements in the tuple are Tensor, IntTensor, and IntTensor. 
If :attr:`get_infos` is ``False``, then the elements in the tuple are Tensor, IntTensor. Default: ``None`` Returns: (Tensor, IntTensor, IntTensor (optional)): A tuple of tensors containing - **factorization** (*Tensor*): the factorization of size :math:`(*, m, n)` - **pivots** (*IntTensor*): the pivots of size :math:`(*, m)` - **infos** (*IntTensor*, *optional*): if :attr:`get_infos` is ``True``, this is a tensor of size :math:`(*)` where non-zero values indicate whether factorization for the matrix or each minibatch has succeeded or failed Example:: >>> A = torch.randn(2, 3, 3) >>> A_LU, pivots = torch.lu(A) >>> A_LU tensor([[[ 1.3506, 2.5558, -0.0816], [ 0.1684, 1.1551, 0.1940], [ 0.1193, 0.6189, -0.5497]], [[ 0.4526, 1.2526, -0.3285], [-0.7988, 0.7175, -0.9701], [ 0.2634, -0.9255, -0.3459]]]) >>> pivots tensor([[ 3, 3, 3], [ 3, 3, 3]], dtype=torch.int32) >>> A_LU, pivots, info = torch.lu(A, get_infos=True) >>> if info.nonzero().size(0) == 0: ... print('LU factorization succeeded for all samples!') LU factorization succeeded for all samples! """ # If get_infos is True, then we don't need to check for errors and vice versa return torch._lu_with_info(A, pivot=pivot, check_errors=(not get_infos)) def _check_list_size(out_len, get_infos, out): # type: (int, bool, List[Tensor]) -> None get_infos_int = 1 if get_infos else 0 if out_len - get_infos_int != 2: raise TypeError("expected tuple of {} elements but got {}" .format(2 + int(get_infos), out_len)) if not isinstance(out, (tuple, list)): raise TypeError("argument 'out' must be tuple of Tensors, not {}" .format(type(out).__name__)) def _lu_with_infos(A, pivot=True, get_infos=False, out=None): # type: (Tensor, bool, bool, Optional[Tuple[Tensor, Tensor, Tensor]]) -> Tuple[Tensor, Tensor, Tensor] if not torch.jit.is_scripting(): if type(A) is not Tensor and has_torch_function((A,)): return handle_torch_function( lu, (A,), A, pivot=pivot, get_infos=get_infos, out=out) result = _lu_impl(A, pivot, get_infos, out) if out is not None: _check_list_size(len(out), get_infos, out) for i in range(len(out)): out[i].resize_as_(result[i]).copy_(result[i]) return out else: return result # A_LU, pivots, infos def _lu_no_infos(A, pivot=True, get_infos=False, out=None): # type: (Tensor, bool, bool, Optional[Tuple[Tensor, Tensor]]) -> Tuple[Tensor, Tensor] # need to check for torch_function here so that we exit early if a torch_function override is present if not torch.jit.is_scripting(): if type(A) is not Tensor and has_torch_function((A,)): return handle_torch_function( lu, (A,), A, pivot=pivot, get_infos=get_infos, out=out) result = _lu_impl(A, pivot, get_infos, out) if out is not None: _check_list_size(len(out), get_infos, out) for i in range(len(out)): out[i].resize_as_(result[i]).copy_(result[i]) return out else: return result[0], result[1] # A_LU, pivots # The return type of lu depends on `get_infos`, so in order to resolve the output type # of lu in TorchScript we need to statically know the value of `get_infos` lu = boolean_dispatch( arg_name='get_infos', arg_index=2, default=False, if_true=_lu_with_infos, if_false=_lu_no_infos, module_name=__name__, func_name='lu') lu.__doc__ = _lu_impl.__doc__ def align_tensors(*tensors): raise RuntimeError('`align_tensors` not yet implemented.') ================================================ FILE: patches/pytorch/1.7.0/functional.diff ================================================ 4a5,7 > import librosa # STFT patch for aarch64 > import numpy as np > 515,516c518,528 < return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore <
normalized, onesided, return_complex) --- > > # STFT patch for aarch64 > # https://stackoverflow.com/a/66872148 > librosa_stft = librosa.stft(input.cpu().detach().numpy().reshape(-1), n_fft, hop_length, win_length, window="hann", center=center, pad_mode=pad_mode) > librosa_stft = np.array([[a.real, a.imag] for a in librosa_stft]) > librosa_stft = np.transpose(librosa_stft, axes=[0, 2, 1]) > librosa_stft = np.expand_dims(librosa_stft, 0) > librosa_stft = torch.from_numpy(librosa_stft) > return librosa_stft > #return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore > # normalized, onesided, return_complex) ================================================ FILE: patches/pytorch/1.7.0/functional.original.py ================================================ from typing import ( Tuple, Optional, Union, Any, Sequence, TYPE_CHECKING ) import torch import torch.nn.functional as F from torch.types import _size from ._lowrank import svd_lowrank, pca_lowrank from .overrides import has_torch_function, handle_torch_function from ._jit_internal import boolean_dispatch, List from ._jit_internal import _overload as overload Tensor = torch.Tensor from torch import _VF __all__ = [ 'atleast_1d', 'atleast_2d', 'atleast_3d', 'align_tensors', 'broadcast_tensors', 'cartesian_prod', 'block_diag', 'cdist', 'chain_matmul', 'einsum', 'istft', 'lu', 'lu_unpack', 'norm', 'meshgrid', 'pca_lowrank', 'split', 'stft', 'svd_lowrank', 'tensordot', 'unique', 'unique_consecutive', ] def broadcast_tensors(*tensors): r"""broadcast_tensors(*tensors) -> List of Tensors Broadcasts the given tensors according to :ref:`broadcasting-semantics`. Args: *tensors: any number of tensors of the same type .. warning:: More than one element of a broadcasted tensor may refer to a single memory location. As a result, in-place operations (especially ones that are vectorized) may result in incorrect behavior. If you need to write to the tensors, please clone them first. Example:: >>> x = torch.arange(3).view(1, 3) >>> y = torch.arange(2).view(2, 1) >>> a, b = torch.broadcast_tensors(x, y) >>> a.size() torch.Size([2, 3]) >>> a tensor([[0, 1, 2], [0, 1, 2]]) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(broadcast_tensors, tensors, *tensors) return _VF.broadcast_tensors(tensors) # type: ignore def split(tensor, split_size_or_sections, dim=0): r"""Splits the tensor into chunks. Each chunk is a view of the original tensor. If :attr:`split_size_or_sections` is an integer type, then :attr:`tensor` will be split into equally sized chunks (if possible). Last chunk will be smaller if the tensor size along the given dimension :attr:`dim` is not divisible by :attr:`split_size`. If :attr:`split_size_or_sections` is a list, then :attr:`tensor` will be split into ``len(split_size_or_sections)`` chunks with sizes in :attr:`dim` according to :attr:`split_size_or_sections`. Arguments: tensor (Tensor): tensor to split. split_size_or_sections (int) or (list(int)): size of a single chunk or list of sizes for each chunk dim (int): dimension along which to split the tensor. 
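For instance, splitting the :math:`5 \times 2` tensor below along ``dim=0`` with ``split_size_or_sections=2`` yields chunks with 2, 2, and 1 rows, while ``[1, 4]`` yields chunks with 1 and 4 rows.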
Example:: >>> a = torch.arange(10).reshape(5,2) >>> a tensor([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]]) >>> torch.split(a, 2) (tensor([[0, 1], [2, 3]]), tensor([[4, 5], [6, 7]]), tensor([[8, 9]])) >>> torch.split(a, [1,4]) (tensor([[0, 1]]), tensor([[2, 3], [4, 5], [6, 7], [8, 9]])) """ if not torch.jit.is_scripting(): if type(tensor) is not Tensor and has_torch_function((tensor,)): return handle_torch_function(split, (tensor,), tensor, split_size_or_sections, dim=dim) # Overwriting reason: # This dispatches to two ATen functions depending on the type of # split_size_or_sections. The branching code is in tensor.py, which we # call here. return tensor.split(split_size_or_sections, dim) if TYPE_CHECKING: _Indices = _size else: _Indices = List[int] # equivalent to itertools.product(indices) def _indices_product(indices: _Indices) -> List[List[int]]: empty_list = torch.jit.annotate(List[int], []) result = [empty_list] for idx in indices: result_temp = torch.jit.annotate(List[List[int]], []) for res in result: for i in range(idx): result_temp.append(res + [i]) result = result_temp return result def _index_tensor_with_indices_list(tensor, indices): # type: (Tensor, List[int]) -> Tensor out = tensor for index in indices: out = out[index] return out def lu_unpack(LU_data, LU_pivots, unpack_data=True, unpack_pivots=True): # type: (Tensor, Tensor, bool, bool) -> (Tuple[Optional[Tensor], Optional[Tensor], Optional[Tensor]]) r"""Unpacks the data and pivots from a LU factorization of a tensor. Returns a tuple of tensors as ``(the pivots, the L tensor, the U tensor)``. Arguments: LU_data (Tensor): the packed LU factorization data LU_pivots (Tensor): the packed LU factorization pivots unpack_data (bool): flag indicating if the data should be unpacked unpack_pivots (bool): flag indicating if the pivots should be unpacked Examples:: >>> A = torch.randn(2, 3, 3) >>> A_LU, pivots = A.lu() >>> P, A_L, A_U = torch.lu_unpack(A_LU, pivots) >>> >>> # can recover A from factorization >>> A_ = torch.bmm(P, torch.bmm(A_L, A_U)) >>> # LU factorization of a rectangular matrix: >>> A = torch.randn(2, 3, 2) >>> A_LU, pivots = A.lu() >>> P, A_L, A_U = torch.lu_unpack(A_LU, pivots) >>> P tensor([[[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]], [[0., 0., 1.], [0., 1., 0.], [1., 0., 0.]]]) >>> A_L tensor([[[ 1.0000, 0.0000], [ 0.4763, 1.0000], [ 0.3683, 0.1135]], [[ 1.0000, 0.0000], [ 0.2957, 1.0000], [-0.9668, -0.3335]]]) >>> A_U tensor([[[ 2.1962, 1.0881], [ 0.0000, -0.8681]], [[-1.0947, 0.3736], [ 0.0000, 0.5718]]]) >>> A_ = torch.bmm(P, torch.bmm(A_L, A_U)) >>> torch.norm(A_ - A) tensor(2.9802e-08) """ if not torch.jit.is_scripting(): tens_ops = (LU_data, LU_pivots) if any([type(t) is not Tensor for t in tens_ops]) and has_torch_function(tens_ops): return handle_torch_function( lu_unpack, tens_ops, LU_data, LU_pivots, unpack_data=unpack_data, unpack_pivots=unpack_pivots) shape = LU_data.shape # In generalized LU factorization, the following shape relations hold: # A.shape[-2:] == (m, n) # P.shape[-2:] == (m, m) # L.shape[-2:] == (m, k) # U.shape[-2:] == (k, n) # where k = min(m, n) m, n = shape[-2:] k = min(m, n) if unpack_data: U: Optional[Tensor] = LU_data.triu() assert U is not None if m != k: U = U.narrow(-2, 0, k) L: Optional[Tensor] = LU_data.tril() assert L is not None if k != n: L = L.narrow(-1, 0, k) L.diagonal(dim1=-2, dim2=-1).fill_(1) else: L = U = None if unpack_pivots: LU_pivots_zero_idx = LU_pivots - 1 if LU_data.dim() > 2: P: Optional[Tensor] = torch.eye(m, device=LU_data.device, dtype=LU_data.dtype) \ 
.expand(shape[:-1] + (m,)) \ .clone(memory_format=torch.contiguous_format) assert P is not None # TODO: rewrite when TorchScript supports product and map as # product(*map(lambda x: list(range(x)), shape[:-2])) when issue 33781 is fixed indices = _indices_product(shape[:-2]) for idx in indices: final_order = [i for i in range(m)] # noqa: C416 TODO: rewrite as list(range(m)) for k, j in enumerate(_index_tensor_with_indices_list(LU_pivots_zero_idx, idx)): final_order[k], final_order[j] = final_order[j], final_order[k] # TODO: remove _index_tensor_with_indices_list when TorchScript supports indexing Tensor with list p_idx = _index_tensor_with_indices_list(P, idx) p_idx.copy_(p_idx.index_select(1, torch.as_tensor(final_order, device=LU_pivots.device))) else: P = torch.eye(m, device=LU_data.device, dtype=LU_data.dtype) final_order = [i for i in range(m)] # noqa: C416 TODO: rewrite as list(range(m)) for k, j, in enumerate(LU_pivots_zero_idx): final_order[k], final_order[j] = final_order[j], final_order[k] P = P.index_select(1, torch.as_tensor(final_order, device=LU_pivots.device)) else: P = None return P, L, U def einsum(equation, *operands): r"""einsum(equation, *operands) -> Tensor This function provides a way of computing multilinear expressions (i.e. sums of products) using the Einstein summation convention. Args: equation (string): The equation is given in terms of lower case letters (indices) to be associated with each dimension of the operands and result. The left hand side lists the operands dimensions, separated by commas. There should be one index letter per tensor dimension. The right hand side follows after `->` and gives the indices for the output. If the `->` and right hand side are omitted, it implicitly defined as the alphabetically sorted list of all indices appearing exactly once in the left hand side. The indices not apprearing in the output are summed over after multiplying the operands entries. If an index appears several times for the same operand, a diagonal is taken. Ellipses `...` represent a fixed number of dimensions. If the right hand side is inferred, the ellipsis dimensions are at the beginning of the output. operands (Tensor): The operands to compute the Einstein sum of. .. note:: This function does not optimize the given expression, so a different formula for the same computation may run faster or consume less memory. Projects like opt_einsum (https://optimized-einsum.readthedocs.io/en/stable/) can optimize the formula for you. 
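For instance, ``torch.einsum('ij,jk', A, B)`` is equivalent to ``torch.einsum('ij,jk->ik', A, B)``: the indices ``i`` and ``k`` each appear exactly once on the left hand side, so the inferred output is their alphabetically sorted list, while the repeated index ``j`` is summed over.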
Examples:: >>> x = torch.randn(5) >>> y = torch.randn(4) >>> torch.einsum('i,j->ij', x, y) # outer product tensor([[-0.0570, -0.0286, -0.0231, 0.0197], [ 1.2616, 0.6335, 0.5113, -0.4351], [ 1.4452, 0.7257, 0.5857, -0.4984], [-0.4647, -0.2333, -0.1883, 0.1603], [-1.1130, -0.5588, -0.4510, 0.3838]]) >>> A = torch.randn(3,5,4) >>> l = torch.randn(2,5) >>> r = torch.randn(2,4) >>> torch.einsum('bn,anm,bm->ba', l, A, r) # compare torch.nn.functional.bilinear tensor([[-0.3430, -5.2405, 0.4494], [ 0.3311, 5.5201, -3.0356]]) >>> As = torch.randn(3,2,5) >>> Bs = torch.randn(3,5,4) >>> torch.einsum('bij,bjk->bik', As, Bs) # batch matrix multiplication tensor([[[-1.0564, -1.5904, 3.2023, 3.1271], [-1.6706, -0.8097, -0.8025, -2.1183]], [[ 4.2239, 0.3107, -0.5756, -0.2354], [-1.4558, -0.3460, 1.5087, -0.8530]], [[ 2.8153, 1.8787, -4.3839, -1.2112], [ 0.3728, -2.1131, 0.0921, 0.8305]]]) >>> A = torch.randn(3, 3) >>> torch.einsum('ii->i', A) # diagonal tensor([-0.7825, 0.8291, -0.1936]) >>> A = torch.randn(4, 3, 3) >>> torch.einsum('...ii->...i', A) # batch diagonal tensor([[-1.0864, 0.7292, 0.0569], [-0.9725, -1.0270, 0.6493], [ 0.5832, -1.1716, -1.5084], [ 0.4041, -1.1690, 0.8570]]) >>> A = torch.randn(2, 3, 4, 5) >>> torch.einsum('...ij->...ji', A).shape # batch permute torch.Size([2, 3, 5, 4]) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in operands) and has_torch_function(operands): return handle_torch_function(einsum, operands, equation, *operands) if len(operands) == 1 and isinstance(operands[0], (list, tuple)): # the old interface of passing the operands as one list argument _operands = operands[0] # recurse incase operands contains value that has torch function # in the original implementation this line is omitted return einsum(equation, *_operands) return _VF.einsum(equation, operands) # type: ignore if TYPE_CHECKING: # The JIT doesn't understand Union, so only add type annotation for mypy def meshgrid(*tensors: Union[Tensor, List[Tensor]]) -> Tuple[Tensor, ...]: return _meshgrid(*tensors) else: def meshgrid(*tensors): return _meshgrid(*tensors) def _meshgrid(*tensors): r"""Take :math:`N` tensors, each of which can be either scalar or 1-dimensional vector, and create :math:`N` N-dimensional grids, where the :math:`i` :sup:`th` grid is defined by expanding the :math:`i` :sup:`th` input over dimensions defined by other inputs. Args: tensors (list of Tensor): list of scalars or 1 dimensional tensors. Scalars will be treated as tensors of size :math:`(1,)` automatically Returns: seq (sequence of Tensors): If the input has :math:`k` tensors of size :math:`(N_1,), (N_2,), \ldots , (N_k,)`, then the output would also have :math:`k` tensors, where all tensors are of size :math:`(N_1, N_2, \ldots , N_k)`. 
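For instance, with the two inputs of size :math:`(3,)` below, both returned grids have size :math:`(3, 3)`: the first expands ``x`` along the new second dimension and the second expands ``y`` along the new first dimension.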
Example:: >>> x = torch.tensor([1, 2, 3]) >>> y = torch.tensor([4, 5, 6]) >>> grid_x, grid_y = torch.meshgrid(x, y) >>> grid_x tensor([[1, 1, 1], [2, 2, 2], [3, 3, 3]]) >>> grid_y tensor([[4, 5, 6], [4, 5, 6], [4, 5, 6]]) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(meshgrid, tensors, *tensors) if len(tensors) == 1 and isinstance(tensors[0], (list, tuple)): # the old interface of passing the operands as one list argument tensors = tensors[0] # type: ignore return _VF.meshgrid(tensors) # type: ignore def stft(input: Tensor, n_fft: int, hop_length: Optional[int] = None, win_length: Optional[int] = None, window: Optional[Tensor] = None, center: bool = True, pad_mode: str = 'reflect', normalized: bool = False, onesided: Optional[bool] = None, return_complex: Optional[bool] = None) -> Tensor: r"""Short-time Fourier transform (STFT). .. warning:: Setting :attr:`return_complex` explicitly will be required in a future PyTorch release. Set it to False to preserve the current behavior or True to return a complex output. The STFT computes the Fourier transform of short overlapping windows of the input. This giving frequency components of the signal as they change over time. The interface of this function is modeled after the librosa_ stft function. .. _librosa: https://librosa.org/doc/latest/generated/librosa.stft.html Ignoring the optional batch dimension, this method computes the following expression: .. math:: X[m, \omega] = \sum_{k = 0}^{\text{win\_length-1}}% \text{window}[k]\ \text{input}[m \times \text{hop\_length} + k]\ % \exp\left(- j \frac{2 \pi \cdot \omega k}{\text{win\_length}}\right), where :math:`m` is the index of the sliding window, and :math:`\omega` is the frequency that :math:`0 \leq \omega < \text{n\_fft}`. When :attr:`onesided` is the default value ``True``, * :attr:`input` must be either a 1-D time sequence or a 2-D batch of time sequences. * If :attr:`hop_length` is ``None`` (default), it is treated as equal to ``floor(n_fft / 4)``. * If :attr:`win_length` is ``None`` (default), it is treated as equal to :attr:`n_fft`. * :attr:`window` can be a 1-D tensor of size :attr:`win_length`, e.g., from :meth:`torch.hann_window`. If :attr:`window` is ``None`` (default), it is treated as if having :math:`1` everywhere in the window. If :math:`\text{win\_length} < \text{n\_fft}`, :attr:`window` will be padded on both sides to length :attr:`n_fft` before being applied. * If :attr:`center` is ``True`` (default), :attr:`input` will be padded on both sides so that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. Otherwise, the :math:`t`-th frame begins at time :math:`t \times \text{hop\_length}`. * :attr:`pad_mode` determines the padding method used on :attr:`input` when :attr:`center` is ``True``. See :meth:`torch.nn.functional.pad` for all available options. Default is ``"reflect"``. * If :attr:`onesided` is ``True`` (default for real input), only values for :math:`\omega` in :math:`\left[0, 1, 2, \dots, \left\lfloor \frac{\text{n\_fft}}{2} \right\rfloor + 1\right]` are returned because the real-to-complex Fourier transform satisfies the conjugate symmetry, i.e., :math:`X[m, \omega] = X[m, \text{n\_fft} - \omega]^*`. Note if the input or window tensors are complex, then :attr:`onesided` output is not possible. 
* If :attr:`normalized` is ``True`` (default is ``False``), the function returns the normalized STFT results, i.e., multiplied by :math:`(\text{frame\_length})^{-0.5}`. * If :attr:`return_complex` is ``True`` (default if input is complex), the return is a ``input.dim() + 1`` dimensional complex tensor. If ``False``, the output is a ``input.dim() + 2`` dimensional real tensor where the last dimension represents the real and imaginary components. Returns either a complex tensor of size :math:`(* \times N \times T)` if :attr:`return_complex` is true, or a real tensor of size :math:`(* \times N \times T \times 2)`. Where :math:`*` is the optional batch size of :attr:`input`, :math:`N` is the number of frequencies where STFT is applied and :math:`T` is the total number of frames used. .. warning:: This function changed signature at version 0.4.1. Calling with the previous signature may cause error or return incorrect result. Arguments: input (Tensor): the input tensor n_fft (int): size of Fourier transform hop_length (int, optional): the distance between neighboring sliding window frames. Default: ``None`` (treated as equal to ``floor(n_fft / 4)``) win_length (int, optional): the size of window frame and STFT filter. Default: ``None`` (treated as equal to :attr:`n_fft`) window (Tensor, optional): the optional window function. Default: ``None`` (treated as window of all :math:`1` s) center (bool, optional): whether to pad :attr:`input` on both sides so that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. Default: ``True`` pad_mode (string, optional): controls the padding method used when :attr:`center` is ``True``. Default: ``"reflect"`` normalized (bool, optional): controls whether to return the normalized STFT results Default: ``False`` onesided (bool, optional): controls whether to return half of results to avoid redundancy for real inputs. Default: ``True`` for real :attr:`input` and :attr:`window`, ``False`` otherwise. return_complex (bool, optional): whether to return a complex tensor, or a real tensor with an extra last dimension for the real and imaginary components. Returns: Tensor: A tensor containing the STFT result with shape described above """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( stft, (input,), input, n_fft, hop_length=hop_length, win_length=win_length, window=window, center=center, pad_mode=pad_mode, normalized=normalized, onesided=onesided, return_complex=return_complex) # TODO: after having proper ways to map Python strings to ATen Enum, move # this and F.pad to ATen. if center: signal_dim = input.dim() extended_shape = [1] * (3 - signal_dim) + list(input.size()) pad = int(n_fft // 2) input = F.pad(input.view(extended_shape), (pad, pad), pad_mode) input = input.view(input.shape[-signal_dim:]) return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore normalized, onesided, return_complex) def istft(input: Tensor, n_fft: int, hop_length: Optional[int] = None, win_length: Optional[int] = None, window: Optional[Tensor] = None, center: bool = True, normalized: bool = False, onesided: Optional[bool] = None, length: Optional[int] = None, return_complex: bool = False) -> Tensor: r"""Inverse short time Fourier Transform. This is expected to be the inverse of :func:`~torch.stft`. It has the same parameters (+ additional optional parameter of :attr:`length`) and it should return the least squares estimation of the original signal. 
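A minimal round-trip sketch (an illustration assuming a Hann window, which satisfies the NOLA condition discussed below; :attr:`length` trims the reconstruction to the original size)::

    >>> x = torch.randn(8000)
    >>> w = torch.hann_window(400)
    >>> X = torch.stft(x, n_fft=400, window=w, return_complex=False)
    >>> y = torch.istft(X, n_fft=400, window=w, length=x.size(0))
    >>> torch.allclose(x, y, atol=1e-5)
    True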
The algorithm will check using the NOLA condition ( nonzero overlap). Important consideration in the parameters :attr:`window` and :attr:`center` so that the envelop created by the summation of all the windows is never zero at certain point in time. Specifically, :math:`\sum_{t=-\infty}^{\infty} |w|^2[n-t\times hop\_length] \cancel{=} 0`. Since :func:`~torch.stft` discards elements at the end of the signal if they do not fit in a frame, ``istft`` may return a shorter signal than the original signal (can occur if :attr:`center` is False since the signal isn't padded). If :attr:`center` is ``True``, then there will be padding e.g. ``'constant'``, ``'reflect'``, etc. Left padding can be trimmed off exactly because they can be calculated but right padding cannot be calculated without additional information. Example: Suppose the last window is: ``[17, 18, 0, 0, 0]`` vs ``[18, 0, 0, 0, 0]`` The :attr:`n_fft`, :attr:`hop_length`, :attr:`win_length` are all the same which prevents the calculation of right padding. These additional values could be zeros or a reflection of the signal so providing :attr:`length` could be useful. If :attr:`length` is ``None`` then padding will be aggressively removed (some loss of signal). [1] D. W. Griffin and J. S. Lim, "Signal estimation from modified short-time Fourier transform," IEEE Trans. ASSP, vol.32, no.2, pp.236-243, Apr. 1984. Arguments: input (Tensor): The input tensor. Expected to be output of :func:`~torch.stft`, can either be complex (``channel``, ``fft_size``, ``n_frame``), or real (``channel``, ``fft_size``, ``n_frame``, 2) where the ``channel`` dimension is optional. n_fft (int): Size of Fourier transform hop_length (Optional[int]): The distance between neighboring sliding window frames. (Default: ``n_fft // 4``) win_length (Optional[int]): The size of window frame and STFT filter. (Default: ``n_fft``) window (Optional[torch.Tensor]): The optional window function. (Default: ``torch.ones(win_length)``) center (bool): Whether :attr:`input` was padded on both sides so that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. (Default: ``True``) normalized (bool): Whether the STFT was normalized. (Default: ``False``) onesided (Optional[bool]): Whether the STFT was onesided. (Default: ``True`` if ``n_fft != fft_size`` in the input size) length (Optional[int]): The amount to trim the signal by (i.e. the original signal length). (Default: whole signal) return_complex (Optional[bool]): Whether the output should be complex, or if the input should be assumed to derive from a real signal and window. Note that this is incompatible with ``onesided=True``. 
(Default: ``False``) Returns: Tensor: Least squares estimation of the original signal of size (..., signal_length) """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( istft, (input,), input, n_fft, hop_length=hop_length, win_length=win_length, window=window, center=center, normalized=normalized, onesided=onesided, length=length, return_complex=return_complex) return _VF.istft(input, n_fft, hop_length, win_length, window, center, # type: ignore normalized, onesided, length, return_complex) del torch.unique_dim if TYPE_CHECKING: # These _impl functions return a variable number of tensors as output with # __torch_function__; tuple unpacking is done already rather than being # done by the caller of the _impl function _unique_impl_out = Any else: _unique_impl_out = Tuple[Tensor, Tensor, Tensor] def _unique_impl(input: Tensor, sorted: bool = True, return_inverse: bool = False, return_counts: bool = False, dim: Optional[int] = None) -> _unique_impl_out: r"""Returns the unique elements of the input tensor. .. note:: This function is different from :func:`torch.unique_consecutive` in the sense that this function also eliminates non-consecutive duplicate values. .. note:: Currently in the CUDA implementation and the CPU implementation when dim is specified, `torch.unique` always sort the tensor at the beginning regardless of the `sort` argument. Sorting could be slow, so if your input tensor is already sorted, it is recommended to use :func:`torch.unique_consecutive` which avoids the sorting. Arguments: input (Tensor): the input tensor sorted (bool): Whether to sort the unique elements in ascending order before returning as output. return_inverse (bool): Whether to also return the indices for where elements in the original input ended up in the returned unique list. return_counts (bool): Whether to also return the counts for each unique element. dim (int): the dimension to apply unique. If ``None``, the unique of the flattened input is returned. default: ``None`` Returns: (Tensor, Tensor (optional), Tensor (optional)): A tensor or a tuple of tensors containing - **output** (*Tensor*): the output list of unique scalar elements. - **inverse_indices** (*Tensor*): (optional) if :attr:`return_inverse` is True, there will be an additional returned tensor (same shape as input) representing the indices for where elements in the original input map to in the output; otherwise, this function will only return a single tensor. - **counts** (*Tensor*): (optional) if :attr:`return_counts` is True, there will be an additional returned tensor (same shape as output or output.size(dim), if dim was specified) representing the number of occurrences for each unique value or tensor. 
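When :attr:`dim` is given, uniqueness is determined over complete subtensors along that dimension (for example, whole rows when ``dim=0``) rather than over individual scalar elements.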
Example:: >>> output = torch.unique(torch.tensor([1, 3, 2, 3], dtype=torch.long)) >>> output tensor([ 1, 2, 3]) >>> output, inverse_indices = torch.unique( torch.tensor([1, 3, 2, 3], dtype=torch.long), sorted=True, return_inverse=True) >>> output tensor([ 1, 2, 3]) >>> inverse_indices tensor([ 0, 2, 1, 2]) >>> output, inverse_indices = torch.unique( torch.tensor([[1, 3], [2, 3]], dtype=torch.long), sorted=True, return_inverse=True) >>> output tensor([ 1, 2, 3]) >>> inverse_indices tensor([[ 0, 2], [ 1, 2]]) """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( unique, (input,), input, sorted=sorted, return_inverse=return_inverse, return_counts=return_counts, dim=dim) if dim is not None: output, inverse_indices, counts = _VF.unique_dim( # type: ignore input, dim, sorted=sorted, return_inverse=return_inverse, return_counts=return_counts, ) else: output, inverse_indices, counts = torch._unique2( input, sorted=sorted, return_inverse=return_inverse, return_counts=return_counts, ) return output, inverse_indices, counts def _unique_consecutive_impl(input: Tensor, return_inverse: bool = False, return_counts: bool = False, dim: Optional[int] = None) -> _unique_impl_out: r"""Eliminates all but the first element from every consecutive group of equivalent elements. .. note:: This function is different from :func:`torch.unique` in the sense that this function only eliminates consecutive duplicate values. The semantics are similar to `std::unique` in C++. Arguments: input (Tensor): the input tensor return_inverse (bool): Whether to also return the indices for where elements in the original input ended up in the returned unique list. return_counts (bool): Whether to also return the counts for each unique element. dim (int): the dimension to apply unique. If ``None``, the unique of the flattened input is returned. default: ``None`` Returns: (Tensor, Tensor (optional), Tensor (optional)): A tensor or a tuple of tensors containing - **output** (*Tensor*): the output list of unique scalar elements. - **inverse_indices** (*Tensor*): (optional) if :attr:`return_inverse` is True, there will be an additional returned tensor (same shape as input) representing the indices for where elements in the original input map to in the output; otherwise, this function will only return a single tensor. - **counts** (*Tensor*): (optional) if :attr:`return_counts` is True, there will be an additional returned tensor (same shape as output or output.size(dim), if dim was specified) representing the number of occurrences for each unique value or tensor.
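As a brief sketch of the ``dim`` path (a toy 2-D input; entire rows are compared as single units):

>>> x = torch.tensor([[1, 1], [1, 1], [2, 2]])
>>> torch.unique_consecutive(x, dim=0)
tensor([[1, 1],
        [2, 2]])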
Example:: >>> x = torch.tensor([1, 1, 2, 2, 3, 1, 1, 2]) >>> output = torch.unique_consecutive(x) >>> output tensor([1, 2, 3, 1, 2]) >>> output, inverse_indices = torch.unique_consecutive(x, return_inverse=True) >>> output tensor([1, 2, 3, 1, 2]) >>> inverse_indices tensor([0, 0, 1, 1, 2, 3, 3, 4]) >>> output, counts = torch.unique_consecutive(x, return_counts=True) >>> output tensor([1, 2, 3, 1, 2]) >>> counts tensor([2, 2, 1, 2, 1]) """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( unique_consecutive, (input,), input, return_inverse=return_inverse, return_counts=return_counts, dim=dim) output, inverse_indices, counts = _VF.unique_consecutive( # type: ignore input, return_inverse=return_inverse, return_counts=return_counts, dim=dim) return output, inverse_indices, counts def _return_counts(input, sorted=True, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor] if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_impl(input, sorted, return_inverse, return_counts, dim) output, _, counts = _unique_impl(input, sorted, return_inverse, return_counts, dim) return output, counts def _return_output(input, sorted=True, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, bool, Optional[int]) -> Tensor if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_impl(input, sorted, return_inverse, return_counts, dim) output, _, _ = _unique_impl(input, sorted, return_inverse, return_counts, dim) return output def _return_inverse(input, sorted=True, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor] if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_impl(input, sorted, return_inverse, return_counts, dim) output, inverse_indices, _ = _unique_impl(input, sorted, return_inverse, return_counts, dim) return output, inverse_indices _return_inverse_false = boolean_dispatch( arg_name='return_counts', arg_index=3, default=False, if_true=_return_counts, if_false=_return_output, module_name=__name__, func_name='unique') _return_inverse_true = boolean_dispatch( arg_name='return_counts', arg_index=3, default=False, if_true=_unique_impl, if_false=_return_inverse, module_name=__name__, func_name='unique') # The return type of unique depends on `return_inverse`, and `return_counts` so in order to # resolve the output type in TorchScript we need to statically know the value of both parameters unique = boolean_dispatch( arg_name='return_inverse', arg_index=2, default=False, if_true=_return_inverse_true, if_false=_return_inverse_false, module_name=__name__, func_name='unique') unique.__doc__ = _unique_impl.__doc__ def _consecutive_return_counts(input, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor] if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_consecutive_impl(input, return_inverse, return_counts, dim) output, _, counts = _unique_consecutive_impl(input, return_inverse, return_counts, dim) return output, counts def _consecutive_return_output(input, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, Optional[int]) -> Tensor if 
not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_consecutive_impl(input, return_inverse, return_counts, dim) output, _, _ = _unique_consecutive_impl(input, return_inverse, return_counts, dim) return output def _consecutive_return_inverse(input, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor] if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_consecutive_impl(input, return_inverse, return_counts, dim) output, inverse_indices, _ = _unique_consecutive_impl(input, return_inverse, return_counts, dim) return output, inverse_indices _consecutive_return_inverse_false = boolean_dispatch( arg_name='return_counts', arg_index=1, default=False, if_true=_consecutive_return_counts, if_false=_consecutive_return_output, module_name=__name__, func_name='unique_consecutive') _consecutive_return_inverse_true = boolean_dispatch( arg_name='return_counts', arg_index=1, default=False, if_true=_unique_consecutive_impl, if_false=_consecutive_return_inverse, module_name=__name__, func_name='unique_consecutive') # The return type of unique depends on `return_inverse`, and `return_counts` so in order to # resolve the output type in TorchScript we need to statically know the value of both parameters unique_consecutive = boolean_dispatch( arg_name='return_inverse', arg_index=2, default=False, if_true=_consecutive_return_inverse_true, if_false=_consecutive_return_inverse_false, module_name=__name__, func_name='unique_consecutive') unique_consecutive.__doc__ = _unique_consecutive_impl.__doc__ def tensordot(a, b, dims=2): r"""Returns a contraction of a and b over multiple dimensions. :attr:`tensordot` implements a generalized matrix product. Args: a (Tensor): Left tensor to contract b (Tensor): Right tensor to contract dims (int or tuple of two lists of integers): number of dimensions to contract or explicit lists of dimensions for :attr:`a` and :attr:`b` respectively When called with a non-negative integer argument :attr:`dims` = :math:`d`, and the number of dimensions of :attr:`a` and :attr:`b` is :math:`m` and :math:`n`, respectively, :func:`~torch.tensordot` computes .. math:: r_{i_0,...,i_{m-d}, i_d,...,i_n} = \sum_{k_0,...,k_{d-1}} a_{i_0,...,i_{m-d},k_0,...,k_{d-1}} \times b_{k_0,...,k_{d-1}, i_d,...,i_n}. When called with :attr:`dims` of the list form, the given dimensions will be contracted in place of the last :math:`d` of :attr:`a` and the first :math:`d` of :math:`b`. The sizes in these dimensions must match, but :func:`~torch.tensordot` will deal with broadcasted dimensions. 
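As a quick sanity check (a sketch with small deterministic inputs, not a normative example), an integer ``dims=1`` reduces to an ordinary matrix product:

>>> a = torch.arange(6.).reshape(2, 3)
>>> b = torch.arange(12.).reshape(3, 4)
>>> torch.allclose(torch.tensordot(a, b, dims=1), a @ b)
True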
Examples:: >>> a = torch.arange(60.).reshape(3, 4, 5) >>> b = torch.arange(24.).reshape(4, 3, 2) >>> torch.tensordot(a, b, dims=([1, 0], [0, 1])) tensor([[4400., 4730.], [4532., 4874.], [4664., 5018.], [4796., 5162.], [4928., 5306.]]) >>> a = torch.randn(3, 4, 5, device='cuda') >>> b = torch.randn(4, 5, 6, device='cuda') >>> torch.tensordot(a, b, dims=2).cpu() tensor([[ 8.3504, -2.5436, 6.2922, 2.7556, -1.0732, 3.2741], [ 3.3161, 0.0704, 5.0187, -0.4079, -4.3126, 4.8744], [ 0.8223, 3.9445, 3.2168, -0.2400, 3.4117, 1.7780]]) """ if not torch.jit.is_scripting(): if (type(a) is not Tensor or type(b) is not Tensor) and has_torch_function((a, b)): return handle_torch_function(tensordot, (a, b), a, b, dims=dims) if isinstance(dims, (list, tuple)) or \ (isinstance(dims, torch.Tensor) and dims.numel() > 1): dims_a, dims_b = dims else: if isinstance(dims, torch.Tensor): dims = dims.item() if dims < 0: raise RuntimeError(f"tensordot expects dims >= 0, but got dims={dims}") dims_a = list(range(-dims, 0)) dims_b = list(range(dims)) return _VF.tensordot(a, b, dims_a, dims_b) # type: ignore def cartesian_prod(*tensors): """Computes the Cartesian product of the given sequence of tensors. The behavior is similar to Python's `itertools.product`. Arguments: *tensors: any number of 1 dimensional tensors. Returns: Tensor: A tensor equivalent to converting all the input tensors into lists, doing `itertools.product` on these lists, and finally converting the resulting list into a tensor. Example:: >>> a = [1, 2, 3] >>> b = [4, 5] >>> list(itertools.product(a, b)) [(1, 4), (1, 5), (2, 4), (2, 5), (3, 4), (3, 5)] >>> tensor_a = torch.tensor(a) >>> tensor_b = torch.tensor(b) >>> torch.cartesian_prod(tensor_a, tensor_b) tensor([[1, 4], [1, 5], [2, 4], [2, 5], [3, 4], [3, 5]]) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(cartesian_prod, tensors, *tensors) return _VF.cartesian_prod(tensors) # type: ignore def block_diag(*tensors): """Create a block diagonal matrix from provided tensors. Arguments: *tensors: One or more tensors with 0, 1, or 2 dimensions. Returns: Tensor: A 2 dimensional tensor with all the input tensors arranged in order such that their upper left and lower right corners are diagonally adjacent. All other elements are set to 0. Example:: >>> import torch >>> A = torch.tensor([[0, 1], [1, 0]]) >>> B = torch.tensor([[3, 4, 5], [6, 7, 8]]) >>> C = torch.tensor(7) >>> D = torch.tensor([1, 2, 3]) >>> E = torch.tensor([[4], [5], [6]]) >>> torch.block_diag(A, B, C, D, E) tensor([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 3, 4, 5, 0, 0, 0, 0, 0], [0, 0, 6, 7, 8, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 7, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 2, 3, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 4], [0, 0, 0, 0, 0, 0, 0, 0, 0, 5], [0, 0, 0, 0, 0, 0, 0, 0, 0, 6]]) """ if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(block_diag, tensors, *tensors) return torch._C._VariableFunctions.block_diag(tensors) # type: ignore def cdist(x1, x2, p=2., compute_mode='use_mm_for_euclid_dist_if_necessary'): # type: (Tensor, Tensor, float, str) -> (Tensor) r"""Computes the batched p-norm distance between each pair of the two collections of row vectors. Args: x1 (Tensor): input tensor of shape :math:`B \times P \times M`. x2 (Tensor): input tensor of shape :math:`B \times R \times M`. p: p value for the p-norm distance to calculate between each vector pair :math:`\in [0, \infty]`.
compute_mode: 'use_mm_for_euclid_dist_if_necessary' - will use matrix multiplication approach to calculate euclidean distance (p = 2) if P > 25 or R > 25 'use_mm_for_euclid_dist' - will always use matrix multiplication approach to calculate euclidean distance (p = 2) 'donot_use_mm_for_euclid_dist' - will never use matrix multiplication approach to calculate euclidean distance (p = 2) Default: use_mm_for_euclid_dist_if_necessary. If x1 has shape :math:`B \times P \times M` and x2 has shape :math:`B \times R \times M` then the output will have shape :math:`B \times P \times R`. This function is equivalent to `scipy.spatial.distance.cdist(input, 'minkowski', p=p)` if :math:`p \in (0, \infty)`. When :math:`p = 0` it is equivalent to `scipy.spatial.distance.cdist(input, 'hamming') * M`. When :math:`p = \infty`, the closest scipy function is `scipy.spatial.distance.cdist(xn, lambda x, y: np.abs(x - y).max())`. Example: >>> a = torch.tensor([[0.9041, 0.0196], [-0.3108, -2.4423], [-0.4821, 1.059]]) >>> a tensor([[ 0.9041, 0.0196], [-0.3108, -2.4423], [-0.4821, 1.0590]]) >>> b = torch.tensor([[-2.1763, -0.4713], [-0.6986, 1.3702]]) >>> b tensor([[-2.1763, -0.4713], [-0.6986, 1.3702]]) >>> torch.cdist(a, b, p=2) tensor([[3.1193, 2.0959], [2.7138, 3.8322], [2.2830, 0.3791]]) """ if not torch.jit.is_scripting(): if (type(x1) is not Tensor or type(x2) is not Tensor) and has_torch_function((x1, x2)): return handle_torch_function( cdist, (x1, x2), x1, x2, p=p, compute_mode=compute_mode) if compute_mode == 'use_mm_for_euclid_dist_if_necessary': return _VF.cdist(x1, x2, p, None) # type: ignore elif compute_mode == 'use_mm_for_euclid_dist': return _VF.cdist(x1, x2, p, 1) # type: ignore elif compute_mode == 'donot_use_mm_for_euclid_dist': return _VF.cdist(x1, x2, p, 2) # type: ignore else: raise ValueError(f"{compute_mode} is not a valid value for compute_mode") def atleast_1d(*tensors): r""" Returns a 1-dimensional view of each input tensor with zero dimensions. Input tensors with one or more dimensions are returned as-is. Args: input (Tensor or list of Tensors) Returns: output (Tensor or tuple of Tensors) Example:: >>> x = torch.randn(2) >>> x tensor([1.4584, 0.7583]) >>> torch.atleast_1d(x) tensor([1.4584, 0.7583]) >>> x = torch.tensor(1.) >>> x tensor(1.) >>> torch.atleast_1d(x) tensor([1.]) >>> x = torch.tensor(0.5) >>> y = torch.tensor(1.) >>> torch.atleast_1d((x,y)) (tensor([0.5000]), tensor([1.])) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(atleast_1d, tensors, *tensors) if len(tensors) == 1: tensors = tensors[0] return _VF.atleast_1d(tensors) # type: ignore def atleast_2d(*tensors): r""" Returns a 2-dimensional view of each input tensor with zero dimensions. Input tensors with two or more dimensions are returned as-is. Args: input (Tensor or list of Tensors) Returns: output (Tensor or tuple of Tensors) Example:: >>> x = torch.tensor(1.) >>> x tensor(1.) >>> torch.atleast_2d(x) tensor([[1.]]) >>> x = torch.randn(2,2) >>> x tensor([[2.2086, 2.5165], [0.1757, 0.5194]]) >>> torch.atleast_2d(x) tensor([[2.2086, 2.5165], [0.1757, 0.5194]]) >>> x = torch.tensor(0.5) >>> y = torch.tensor(1.)
>>> torch.atleast_2d((x,y)) (tensor([[0.5000]]), tensor([[1.]])) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(atleast_2d, tensors, *tensors) if len(tensors) == 1: tensors = tensors[0] return _VF.atleast_2d(tensors) # type: ignore def atleast_3d(*tensors): r""" Returns a 3-dimensional view of each input tensor with zero dimensions. Input tensors with three or more dimensions are returned as-is. Args: input (Tensor or list of Tensors) Returns: output (Tensor or tuple of Tensors) Example: >>> x = torch.tensor(0.5) >>> x tensor(0.5000) >>> torch.atleast_3d(x) tensor([[[0.5000]]]) >>> y = torch.randn(2,2) >>> y tensor([[-0.8079, 0.7460], [-1.1647, 1.4734]]) >>> torch.atleast_3d(y) tensor([[[-0.8079], [ 0.7460]], [[-1.1647], [ 1.4734]]]) >>> x = torch.randn(1,1,1) >>> x tensor([[[-1.5689]]]) >>> torch.atleast_3d(x) tensor([[[-1.5689]]]) >>> x = torch.tensor(0.5) >>> y = torch.tensor(1.) >>> torch.atleast_3d((x,y)) (tensor([[[0.5000]]]), tensor([[[1.]]])) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(atleast_3d, tensors, *tensors) if len(tensors) == 1: tensors = tensors[0] return _VF.atleast_3d(tensors) # type: ignore if TYPE_CHECKING: pass # There's no good way to use this type annotation; cannot rename norm() to # _norm_impl() in a way that doesn't break JIT overloads. So leave untyped # for mypy for now. # def norm(input: Tensor, # p: Optional[Union[str, Number]] = "fro", # dim: Optional[Union[int, List[int]]] = None, # keepdim: bool = False, # out: Optional[Tensor] = None, # dtype: _dtype = None) -> Tensor: # return _norm_impl(input, p, dim, keepdim, out, dtype) else: # TODO: type dim as BroadcastingList when # https://github.com/pytorch/pytorch/issues/33782 is fixed @overload # noqa: 749 def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 # type: (Tensor, str, Optional[List[int]], bool, Optional[Tensor], Optional[int]) -> Tensor pass @overload # noqa: 749 def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 # type: (Tensor, Optional[number], Optional[List[int]], bool, Optional[Tensor], Optional[int]) -> Tensor pass @overload # noqa: 749 def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 # type: (Tensor, Optional[number], Optional[int], bool, Optional[Tensor], Optional[int]) -> Tensor pass @overload # noqa: 749 def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 # type: (Tensor, str, Optional[int], bool, Optional[Tensor], Optional[int]) -> Tensor pass def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 r"""Returns the matrix norm or vector norm of a given tensor. .. warning:: torch.norm is deprecated and may be removed in a future PyTorch release. Use :func:`torch.linalg.norm` instead, but note that :func:`torch.linalg.norm` has a different signature and slightly different behavior that is more consistent with NumPy's numpy.linalg.norm. Args: input (Tensor): the input tensor p (int, float, inf, -inf, 'fro', 'nuc', optional): the order of norm.
Default: ``'fro'`` The following norms can be calculated:

=====  ============================  ==========================
ord    matrix norm                   vector norm
=====  ============================  ==========================
None   Frobenius norm                2-norm
'fro'  Frobenius norm                --
'nuc'  nuclear norm                  --
Other  as vec norm when dim is None  sum(abs(x)**ord)**(1./ord)
=====  ============================  ==========================

dim (int, 2-tuple of ints, 2-list of ints, optional): If it is an int, vector norm will be calculated, if it is 2-tuple of ints, matrix norm will be calculated. If the value is None, matrix norm will be calculated when the input tensor only has two dimensions, vector norm will be calculated when the input tensor only has one dimension. If the input tensor has more than two dimensions, the vector norm will be applied to last dimension. keepdim (bool, optional): whether the output tensors have :attr:`dim` retained or not. Ignored if :attr:`dim` = ``None`` and :attr:`out` = ``None``. Default: ``False`` out (Tensor, optional): the output tensor. Ignored if :attr:`dim` = ``None`` and :attr:`out` = ``None``. dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. If specified, the input tensor is cast to :attr:`dtype` while performing the operation. Default: None. Example:: >>> import torch >>> a = torch.arange(9, dtype= torch.float) - 4 >>> b = a.reshape((3, 3)) >>> torch.norm(a) tensor(7.7460) >>> torch.norm(b) tensor(7.7460) >>> torch.norm(a, float('inf')) tensor(4.) >>> torch.norm(b, float('inf')) tensor(4.) >>> c = torch.tensor([[ 1, 2, 3],[-1, 1, 4]] , dtype= torch.float) >>> torch.norm(c, dim=0) tensor([1.4142, 2.2361, 5.0000]) >>> torch.norm(c, dim=1) tensor([3.7417, 4.2426]) >>> torch.norm(c, p=1, dim=1) tensor([6., 6.]) >>> d = torch.arange(8, dtype= torch.float).reshape(2,2,2) >>> torch.norm(d, dim=(1,2)) tensor([ 3.7417, 11.2250]) >>> torch.norm(d[0, :, :]), torch.norm(d[1, :, :]) (tensor(3.7417), tensor(11.2250)) """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( norm, (input,), input, p=p, dim=dim, keepdim=keepdim, out=out, dtype=dtype) ndim = input.dim() # catch default case if dim is None and out is None and dtype is None and p is not None: if isinstance(p, str): if p == "fro": return _VF.frobenius_norm(input, dim=(), keepdim=keepdim) # type: ignore if not isinstance(p, str): _dim = [i for i in range(ndim)] # noqa: C416 TODO: rewrite as list(range(m)) return _VF.norm(input, p, dim=_dim, keepdim=keepdim) # type: ignore # TODO: when https://github.com/pytorch/pytorch/issues/33782 is fixed # remove the overloads where dim is an int and replace with BroadcastingList1 # and remove next four lines, replace _dim with dim if dim is not None: if isinstance(dim, int): _dim = [dim] else: _dim = dim else: _dim = None # type: ignore if isinstance(p, str): if p == "fro": if dtype is not None: raise ValueError("dtype argument is not supported in frobenius norm") if _dim is None: _dim = [i for i in range(ndim)] # noqa: C416 TODO: rewrite as list(range(m)) if out is None: return _VF.frobenius_norm(input, _dim, keepdim=keepdim) # type: ignore else: return _VF.frobenius_norm(input, _dim, keepdim=keepdim, out=out) # type: ignore elif p == "nuc": if dtype is not None: raise ValueError("dtype argument is not supported in nuclear norm") if _dim is None: if out is None: return _VF.nuclear_norm(input, keepdim=keepdim) # type: ignore else: return _VF.nuclear_norm(input, keepdim=keepdim, out=out) #
type: ignore else: if out is None: return _VF.nuclear_norm(input, _dim, keepdim=keepdim) # type: ignore else: return _VF.nuclear_norm(input, _dim, keepdim=keepdim, out=out) # type: ignore raise RuntimeError(f"only valid string values are 'fro' and 'nuc', found {p}") else: if _dim is None: _dim = [i for i in range(ndim)] # noqa: C416 TODO: rewrite as list(range(m)) if out is None: if dtype is None: return _VF.norm(input, p, _dim, keepdim=keepdim) # type: ignore else: return _VF.norm(input, p, _dim, keepdim=keepdim, dtype=dtype) # type: ignore else: if dtype is None: return _VF.norm(input, p, _dim, keepdim=keepdim, out=out) # type: ignore else: return _VF.norm(input, p, _dim, keepdim=keepdim, dtype=dtype, out=out) # type: ignore def chain_matmul(*matrices): r"""Returns the matrix product of the :math:`N` 2-D tensors. This product is efficiently computed using the matrix chain order algorithm, which selects the order that incurs the lowest cost in terms of arithmetic operations (`[CLRS]`_). Note that since this is a function to compute the product, :math:`N` needs to be greater than or equal to 2; if equal to 2 then a trivial matrix-matrix product is returned. If :math:`N` is 1, then this is a no-op - the original matrix is returned as is. Args: matrices (Tensors...): a sequence of 2 or more 2-D tensors whose product is to be determined. Returns: Tensor: if the :math:`i^{th}` tensor was of dimensions :math:`p_{i} \times p_{i + 1}`, then the product would be of dimensions :math:`p_{1} \times p_{N + 1}`. Example:: >>> a = torch.randn(3, 4) >>> b = torch.randn(4, 5) >>> c = torch.randn(5, 6) >>> d = torch.randn(6, 7) >>> torch.chain_matmul(a, b, c, d) tensor([[ -2.3375, -3.9790, -4.1119, -6.6577, 9.5609, -11.5095, -3.2614], [ 21.4038, 3.3378, -8.4982, -5.2457, -10.2561, -2.4684, 2.7163], [ -0.9647, -5.8917, -2.3213, -5.2284, 12.8615, -12.2816, -2.5095]]) .. _`[CLRS]`: https://mitpress.mit.edu/books/introduction-algorithms-third-edition """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in matrices) and has_torch_function(matrices): return handle_torch_function(chain_matmul, matrices, *matrices) return _VF.chain_matmul(matrices) # type: ignore def _lu_impl(A, pivot=True, get_infos=False, out=None): # type: (Tensor, bool, bool, Any) -> Tuple[Tensor, Tensor, Tensor] r"""Computes the LU factorization of a matrix or batches of matrices :attr:`A`. Returns a tuple containing the LU factorization and pivots of :attr:`A`. Pivoting is done if :attr:`pivot` is set to ``True``. .. note:: The pivots returned by the function are 1-indexed. If :attr:`pivot` is ``False``, then the returned pivots tensor is filled with zeros of the appropriate size. .. note:: LU factorization with :attr:`pivot` = ``False`` is not available for CPU, and attempting to do so will throw an error. However, LU factorization with :attr:`pivot` = ``False`` is available for CUDA. .. note:: This function does not check if the factorization was successful or not if :attr:`get_infos` is ``True`` since the status of the factorization is present in the third element of the return tuple. .. note:: In the case of batches of square matrices with size less than or equal to 32 on a CUDA device, the LU factorization is repeated for singular matrices due to a bug in the MAGMA library (see magma issue 13). .. note:: ``L``, ``U``, and ``P`` can be derived using :func:`torch.lu_unpack`. Arguments: A (Tensor): the tensor to factor of size :math:`(*, m, n)` pivot (bool, optional): controls whether pivoting is done.
Default: ``True`` get_infos (bool, optional): if set to ``True``, returns an info IntTensor. Default: ``False`` out (tuple, optional): optional output tuple. If :attr:`get_infos` is ``True``, then the elements in the tuple are Tensor, IntTensor, and IntTensor. If :attr:`get_infos` is ``False``, then the elements in the tuple are Tensor, IntTensor. Default: ``None`` Returns: (Tensor, IntTensor, IntTensor (optional)): A tuple of tensors containing - **factorization** (*Tensor*): the factorization of size :math:`(*, m, n)` - **pivots** (*IntTensor*): the pivots of size :math:`(*, m)` - **infos** (*IntTensor*, *optional*): if :attr:`get_infos` is ``True``, this is a tensor of size :math:`(*)` where non-zero values indicate whether factorization for the matrix or each minibatch has succeeded or failed Example:: >>> A = torch.randn(2, 3, 3) >>> A_LU, pivots = torch.lu(A) >>> A_LU tensor([[[ 1.3506, 2.5558, -0.0816], [ 0.1684, 1.1551, 0.1940], [ 0.1193, 0.6189, -0.5497]], [[ 0.4526, 1.2526, -0.3285], [-0.7988, 0.7175, -0.9701], [ 0.2634, -0.9255, -0.3459]]]) >>> pivots tensor([[ 3, 3, 3], [ 3, 3, 3]], dtype=torch.int32) >>> A_LU, pivots, info = torch.lu(A, get_infos=True) >>> if info.nonzero().size(0) == 0: ... print('LU factorization succeeded for all samples!') LU factorization succeeded for all samples! """ # If get_infos is True, then we don't need to check for errors and vice versa return torch._lu_with_info(A, pivot=pivot, check_errors=(not get_infos)) if TYPE_CHECKING: _ListOrSeq = Sequence[Tensor] else: _ListOrSeq = List[Tensor] def _check_list_size(out_len: int, get_infos: bool, out: _ListOrSeq) -> None: get_infos_int = 1 if get_infos else 0 if out_len - get_infos_int != 2: raise TypeError(f"expected tuple of {2 + int(get_infos)} elements but got {out_len}") if not isinstance(out, (tuple, list)): raise TypeError(f"argument 'out' must be tuple of Tensors, not {type(out).__name__}") def _lu_with_infos(A, pivot=True, get_infos=False, out=None): # type: (Tensor, bool, bool, Optional[Tuple[Tensor, Tensor, Tensor]]) -> Tuple[Tensor, Tensor, Tensor] if not torch.jit.is_scripting(): if type(A) is not Tensor and has_torch_function((A,)): return handle_torch_function( lu, (A,), A, pivot=pivot, get_infos=get_infos, out=out) result = _lu_impl(A, pivot, get_infos, out) if out is not None: _check_list_size(len(out), get_infos, out) for i in range(len(out)): out[i].resize_as_(result[i]).copy_(result[i]) return out else: return result # A_LU, pivots, infos def _lu_no_infos(A, pivot=True, get_infos=False, out=None): # type: (Tensor, bool, bool, Optional[Tuple[Tensor, Tensor]]) -> Tuple[Tensor, Tensor] # need to check for torch_function here so that we exit early when a # __torch_function__ override is present if not torch.jit.is_scripting(): if type(A) is not Tensor and has_torch_function((A,)): return handle_torch_function( lu, (A,), A, pivot=pivot, get_infos=get_infos, out=out) result = _lu_impl(A, pivot, get_infos, out) if out is not None: _check_list_size(len(out), get_infos, out) for i in range(len(out)): out[i].resize_as_(result[i]).copy_(result[i]) return out else: return result[0], result[1] # A_LU, pivots # The return type of lu depends on `get_infos`, so in order to resolve the output type # of lu in TorchScript we need to statically know the value of `get_infos` lu = boolean_dispatch( arg_name='get_infos', arg_index=2, default=False, if_true=_lu_with_infos, if_false=_lu_no_infos, module_name=__name__, func_name='lu') lu.__doc__ = _lu_impl.__doc__ def align_tensors(*tensors): raise RuntimeError('`align_tensors` not yet implemented.')
================================================ FILE: patches/pytorch/1.7.0/functional.py ================================================ from typing import ( Tuple, Optional, Union, Any, Sequence, TYPE_CHECKING ) import librosa # STFT patch for aarch64 import numpy as np import torch import torch.nn.functional as F from torch.types import _size from ._lowrank import svd_lowrank, pca_lowrank from .overrides import has_torch_function, handle_torch_function from ._jit_internal import boolean_dispatch, List from ._jit_internal import _overload as overload Tensor = torch.Tensor from torch import _VF __all__ = [ 'atleast_1d', 'atleast_2d', 'atleast_3d', 'align_tensors', 'broadcast_tensors', 'cartesian_prod', 'block_diag', 'cdist', 'chain_matmul', 'einsum', 'istft', 'lu', 'lu_unpack', 'norm', 'meshgrid', 'pca_lowrank', 'split', 'stft', 'svd_lowrank', 'tensordot', 'unique', 'unique_consecutive', ] def broadcast_tensors(*tensors): r"""broadcast_tensors(*tensors) -> List of Tensors Broadcasts the given tensors according to :ref:`broadcasting-semantics`. Args: *tensors: any number of tensors of the same type .. warning:: More than one element of a broadcasted tensor may refer to a single memory location. As a result, in-place operations (especially ones that are vectorized) may result in incorrect behavior. If you need to write to the tensors, please clone them first. Example:: >>> x = torch.arange(3).view(1, 3) >>> y = torch.arange(2).view(2, 1) >>> a, b = torch.broadcast_tensors(x, y) >>> a.size() torch.Size([2, 3]) >>> a tensor([[0, 1, 2], [0, 1, 2]]) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(broadcast_tensors, tensors, *tensors) return _VF.broadcast_tensors(tensors) # type: ignore def split(tensor, split_size_or_sections, dim=0): r"""Splits the tensor into chunks. Each chunk is a view of the original tensor. If :attr:`split_size_or_sections` is an integer type, then :attr:`tensor` will be split into equally sized chunks (if possible). Last chunk will be smaller if the tensor size along the given dimension :attr:`dim` is not divisible by :attr:`split_size`. If :attr:`split_size_or_sections` is a list, then :attr:`tensor` will be split into ``len(split_size_or_sections)`` chunks with sizes in :attr:`dim` according to :attr:`split_size_or_sections`. Arguments: tensor (Tensor): tensor to split. split_size_or_sections (int) or (list(int)): size of a single chunk or list of sizes for each chunk dim (int): dimension along which to split the tensor. Example:: >>> a = torch.arange(10).reshape(5,2) >>> a tensor([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]]) >>> torch.split(a, 2) (tensor([[0, 1], [2, 3]]), tensor([[4, 5], [6, 7]]), tensor([[8, 9]])) >>> torch.split(a, [1,4]) (tensor([[0, 1]]), tensor([[2, 3], [4, 5], [6, 7], [8, 9]])) """ if not torch.jit.is_scripting(): if type(tensor) is not Tensor and has_torch_function((tensor,)): return handle_torch_function(split, (tensor,), tensor, split_size_or_sections, dim=dim) # Overwriting reason: # This dispatches to two ATen functions depending on the type of # split_size_or_sections. The branching code is in tensor.py, which we # call here. 
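# Illustrative sketch (values assume these exact toy inputs): an int gives
# equal chunks with a smaller remainder chunk, a list gives exactly those sizes:
#   torch.split(torch.arange(5), 2)      -> (tensor([0, 1]), tensor([2, 3]), tensor([4]))
#   torch.split(torch.arange(5), [1, 4]) -> (tensor([0]), tensor([1, 2, 3, 4]))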
return tensor.split(split_size_or_sections, dim) if TYPE_CHECKING: _Indices = _size else: _Indices = List[int] # equivalent to itertools.product(indices) def _indices_product(indices: _Indices) -> List[List[int]]: empty_list = torch.jit.annotate(List[int], []) result = [empty_list] for idx in indices: result_temp = torch.jit.annotate(List[List[int]], []) for res in result: for i in range(idx): result_temp.append(res + [i]) result = result_temp return result def _index_tensor_with_indices_list(tensor, indices): # type: (Tensor, List[int]) -> Tensor out = tensor for index in indices: out = out[index] return out def lu_unpack(LU_data, LU_pivots, unpack_data=True, unpack_pivots=True): # type: (Tensor, Tensor, bool, bool) -> (Tuple[Optional[Tensor], Optional[Tensor], Optional[Tensor]]) r"""Unpacks the data and pivots from a LU factorization of a tensor. Returns a tuple of tensors as ``(the pivots, the L tensor, the U tensor)``. Arguments: LU_data (Tensor): the packed LU factorization data LU_pivots (Tensor): the packed LU factorization pivots unpack_data (bool): flag indicating if the data should be unpacked unpack_pivots (bool): flag indicating if the pivots should be unpacked Examples:: >>> A = torch.randn(2, 3, 3) >>> A_LU, pivots = A.lu() >>> P, A_L, A_U = torch.lu_unpack(A_LU, pivots) >>> >>> # can recover A from factorization >>> A_ = torch.bmm(P, torch.bmm(A_L, A_U)) >>> # LU factorization of a rectangular matrix: >>> A = torch.randn(2, 3, 2) >>> A_LU, pivots = A.lu() >>> P, A_L, A_U = torch.lu_unpack(A_LU, pivots) >>> P tensor([[[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]], [[0., 0., 1.], [0., 1., 0.], [1., 0., 0.]]]) >>> A_L tensor([[[ 1.0000, 0.0000], [ 0.4763, 1.0000], [ 0.3683, 0.1135]], [[ 1.0000, 0.0000], [ 0.2957, 1.0000], [-0.9668, -0.3335]]]) >>> A_U tensor([[[ 2.1962, 1.0881], [ 0.0000, -0.8681]], [[-1.0947, 0.3736], [ 0.0000, 0.5718]]]) >>> A_ = torch.bmm(P, torch.bmm(A_L, A_U)) >>> torch.norm(A_ - A) tensor(2.9802e-08) """ if not torch.jit.is_scripting(): tens_ops = (LU_data, LU_pivots) if any([type(t) is not Tensor for t in tens_ops]) and has_torch_function(tens_ops): return handle_torch_function( lu_unpack, tens_ops, LU_data, LU_pivots, unpack_data=unpack_data, unpack_pivots=unpack_pivots) shape = LU_data.shape # In generalized LU factorization, the following shape relations hold: # A.shape[-2:] == (m, n) # P.shape[-2:] == (m, m) # L.shape[-2:] == (m, k) # U.shape[-2:] == (k, n) # where k = min(m, n) m, n = shape[-2:] k = min(m, n) if unpack_data: U: Optional[Tensor] = LU_data.triu() assert U is not None if m != k: U = U.narrow(-2, 0, k) L: Optional[Tensor] = LU_data.tril() assert L is not None if k != n: L = L.narrow(-1, 0, k) L.diagonal(dim1=-2, dim2=-1).fill_(1) else: L = U = None if unpack_pivots: LU_pivots_zero_idx = LU_pivots - 1 if LU_data.dim() > 2: P: Optional[Tensor] = torch.eye(m, device=LU_data.device, dtype=LU_data.dtype) \ .expand(shape[:-1] + (m,)) \ .clone(memory_format=torch.contiguous_format) assert P is not None # TODO: rewrite when TorchScript supports product and map as # product(*map(lambda x: list(range(x)), shape[:-2])) when issue 33781 is fixed indices = _indices_product(shape[:-2]) for idx in indices: final_order = [i for i in range(m)] # noqa: C416 TODO: rewrite as list(range(m)) for k, j in enumerate(_index_tensor_with_indices_list(LU_pivots_zero_idx, idx)): final_order[k], final_order[j] = final_order[j], final_order[k] # TODO: remove _index_tensor_with_indices_list when TorchScript supports indexing Tensor with list p_idx = 
_index_tensor_with_indices_list(P, idx) p_idx.copy_(p_idx.index_select(1, torch.as_tensor(final_order, device=LU_pivots.device))) else: P = torch.eye(m, device=LU_data.device, dtype=LU_data.dtype) final_order = [i for i in range(m)] # noqa: C416 TODO: rewrite as list(range(m)) for k, j, in enumerate(LU_pivots_zero_idx): final_order[k], final_order[j] = final_order[j], final_order[k] P = P.index_select(1, torch.as_tensor(final_order, device=LU_pivots.device)) else: P = None return P, L, U def einsum(equation, *operands): r"""einsum(equation, *operands) -> Tensor This function provides a way of computing multilinear expressions (i.e. sums of products) using the Einstein summation convention. Args: equation (string): The equation is given in terms of lower case letters (indices) to be associated with each dimension of the operands and result. The left hand side lists the operands dimensions, separated by commas. There should be one index letter per tensor dimension. The right hand side follows after `->` and gives the indices for the output. If the `->` and right hand side are omitted, it is implicitly defined as the alphabetically sorted list of all indices appearing exactly once in the left hand side. The indices not appearing in the output are summed over after multiplying the operands' entries. If an index appears several times for the same operand, a diagonal is taken. Ellipses `...` represent a fixed number of dimensions. If the right hand side is inferred, the ellipsis dimensions are at the beginning of the output. operands (Tensor): The operands to compute the Einstein sum of. .. note:: This function does not optimize the given expression, so a different formula for the same computation may run faster or consume less memory. Projects like opt_einsum (https://optimized-einsum.readthedocs.io/en/stable/) can optimize the formula for you.
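As a tiny deterministic sketch (summing all entries, and taking a trace):

>>> torch.einsum('ij->', torch.arange(6.).reshape(2, 3))
tensor(15.)
>>> torch.einsum('ii', torch.eye(3))
tensor(3.)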
Examples:: >>> x = torch.randn(5) >>> y = torch.randn(4) >>> torch.einsum('i,j->ij', x, y) # outer product tensor([[-0.0570, -0.0286, -0.0231, 0.0197], [ 1.2616, 0.6335, 0.5113, -0.4351], [ 1.4452, 0.7257, 0.5857, -0.4984], [-0.4647, -0.2333, -0.1883, 0.1603], [-1.1130, -0.5588, -0.4510, 0.3838]]) >>> A = torch.randn(3,5,4) >>> l = torch.randn(2,5) >>> r = torch.randn(2,4) >>> torch.einsum('bn,anm,bm->ba', l, A, r) # compare torch.nn.functional.bilinear tensor([[-0.3430, -5.2405, 0.4494], [ 0.3311, 5.5201, -3.0356]]) >>> As = torch.randn(3,2,5) >>> Bs = torch.randn(3,5,4) >>> torch.einsum('bij,bjk->bik', As, Bs) # batch matrix multiplication tensor([[[-1.0564, -1.5904, 3.2023, 3.1271], [-1.6706, -0.8097, -0.8025, -2.1183]], [[ 4.2239, 0.3107, -0.5756, -0.2354], [-1.4558, -0.3460, 1.5087, -0.8530]], [[ 2.8153, 1.8787, -4.3839, -1.2112], [ 0.3728, -2.1131, 0.0921, 0.8305]]]) >>> A = torch.randn(3, 3) >>> torch.einsum('ii->i', A) # diagonal tensor([-0.7825, 0.8291, -0.1936]) >>> A = torch.randn(4, 3, 3) >>> torch.einsum('...ii->...i', A) # batch diagonal tensor([[-1.0864, 0.7292, 0.0569], [-0.9725, -1.0270, 0.6493], [ 0.5832, -1.1716, -1.5084], [ 0.4041, -1.1690, 0.8570]]) >>> A = torch.randn(2, 3, 4, 5) >>> torch.einsum('...ij->...ji', A).shape # batch permute torch.Size([2, 3, 5, 4]) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in operands) and has_torch_function(operands): return handle_torch_function(einsum, operands, equation, *operands) if len(operands) == 1 and isinstance(operands[0], (list, tuple)): # the old interface of passing the operands as one list argument _operands = operands[0] # recurse in case operands contains a value that has a torch function # in the original implementation this line is omitted return einsum(equation, *_operands) return _VF.einsum(equation, operands) # type: ignore if TYPE_CHECKING: # The JIT doesn't understand Union, so only add type annotation for mypy def meshgrid(*tensors: Union[Tensor, List[Tensor]]) -> Tuple[Tensor, ...]: return _meshgrid(*tensors) else: def meshgrid(*tensors): return _meshgrid(*tensors) def _meshgrid(*tensors): r"""Take :math:`N` tensors, each of which can be either a scalar or a 1-dimensional vector, and create :math:`N` N-dimensional grids, where the :math:`i` :sup:`th` grid is defined by expanding the :math:`i` :sup:`th` input over dimensions defined by other inputs. Args: tensors (list of Tensor): list of scalars or 1 dimensional tensors. Scalars will be treated as tensors of size :math:`(1,)` automatically Returns: seq (sequence of Tensors): If the input has :math:`k` tensors of size :math:`(N_1,), (N_2,), \ldots , (N_k,)`, then the output would also have :math:`k` tensors, where all tensors are of size :math:`(N_1, N_2, \ldots , N_k)`.
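A short sketch of the typical use, evaluating a function of two variables on a grid (toy integer inputs):

>>> gx, gy = torch.meshgrid(torch.tensor([1, 2]), torch.tensor([3, 4]))
>>> gx * gy
tensor([[3, 4],
        [6, 8]])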
Example:: >>> x = torch.tensor([1, 2, 3]) >>> y = torch.tensor([4, 5, 6]) >>> grid_x, grid_y = torch.meshgrid(x, y) >>> grid_x tensor([[1, 1, 1], [2, 2, 2], [3, 3, 3]]) >>> grid_y tensor([[4, 5, 6], [4, 5, 6], [4, 5, 6]]) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(meshgrid, tensors, *tensors) if len(tensors) == 1 and isinstance(tensors[0], (list, tuple)): # the old interface of passing the operands as one list argument tensors = tensors[0] # type: ignore return _VF.meshgrid(tensors) # type: ignore def stft(input: Tensor, n_fft: int, hop_length: Optional[int] = None, win_length: Optional[int] = None, window: Optional[Tensor] = None, center: bool = True, pad_mode: str = 'reflect', normalized: bool = False, onesided: Optional[bool] = None, return_complex: Optional[bool] = None) -> Tensor: r"""Short-time Fourier transform (STFT). .. warning:: Setting :attr:`return_complex` explicitly will be required in a future PyTorch release. Set it to False to preserve the current behavior or True to return a complex output. The STFT computes the Fourier transform of short overlapping windows of the input. This gives the frequency components of the signal as they change over time. The interface of this function is modeled after the librosa_ stft function. .. _librosa: https://librosa.org/doc/latest/generated/librosa.stft.html Ignoring the optional batch dimension, this method computes the following expression: .. math:: X[m, \omega] = \sum_{k = 0}^{\text{win\_length-1}}% \text{window}[k]\ \text{input}[m \times \text{hop\_length} + k]\ % \exp\left(- j \frac{2 \pi \cdot \omega k}{\text{win\_length}}\right), where :math:`m` is the index of the sliding window, and :math:`\omega` is the frequency, where :math:`0 \leq \omega < \text{n\_fft}`. When :attr:`onesided` is the default value ``True``, * :attr:`input` must be either a 1-D time sequence or a 2-D batch of time sequences. * If :attr:`hop_length` is ``None`` (default), it is treated as equal to ``floor(n_fft / 4)``. * If :attr:`win_length` is ``None`` (default), it is treated as equal to :attr:`n_fft`. * :attr:`window` can be a 1-D tensor of size :attr:`win_length`, e.g., from :meth:`torch.hann_window`. If :attr:`window` is ``None`` (default), it is treated as if having :math:`1` everywhere in the window. If :math:`\text{win\_length} < \text{n\_fft}`, :attr:`window` will be padded on both sides to length :attr:`n_fft` before being applied. * If :attr:`center` is ``True`` (default), :attr:`input` will be padded on both sides so that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. Otherwise, the :math:`t`-th frame begins at time :math:`t \times \text{hop\_length}`. * :attr:`pad_mode` determines the padding method used on :attr:`input` when :attr:`center` is ``True``. See :meth:`torch.nn.functional.pad` for all available options. Default is ``"reflect"``. * If :attr:`onesided` is ``True`` (default for real input), only values for :math:`\omega` in :math:`\left[0, 1, 2, \dots, \left\lfloor \frac{\text{n\_fft}}{2} \right\rfloor + 1\right]` are returned because the real-to-complex Fourier transform satisfies the conjugate symmetry, i.e., :math:`X[m, \omega] = X[m, \text{n\_fft} - \omega]^*`. Note if the input or window tensors are complex, then :attr:`onesided` output is not possible.
* If :attr:`normalized` is ``True`` (default is ``False``), the function returns the normalized STFT results, i.e., multiplied by :math:`(\text{frame\_length})^{-0.5}`. * If :attr:`return_complex` is ``True`` (default if input is complex), the return is a ``input.dim() + 1`` dimensional complex tensor. If ``False``, the output is a ``input.dim() + 2`` dimensional real tensor where the last dimension represents the real and imaginary components. Returns either a complex tensor of size :math:`(* \times N \times T)` if :attr:`return_complex` is ``True``, or a real tensor of size :math:`(* \times N \times T \times 2)`, where :math:`*` is the optional batch size of :attr:`input`, :math:`N` is the number of frequencies where STFT is applied, and :math:`T` is the total number of frames used. .. warning:: This function changed signature at version 0.4.1. Calling with the previous signature may cause an error or return an incorrect result. Arguments: input (Tensor): the input tensor n_fft (int): size of Fourier transform hop_length (int, optional): the distance between neighboring sliding window frames. Default: ``None`` (treated as equal to ``floor(n_fft / 4)``) win_length (int, optional): the size of window frame and STFT filter. Default: ``None`` (treated as equal to :attr:`n_fft`) window (Tensor, optional): the optional window function. Default: ``None`` (treated as window of all :math:`1` s) center (bool, optional): whether to pad :attr:`input` on both sides so that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. Default: ``True`` pad_mode (string, optional): controls the padding method used when :attr:`center` is ``True``. Default: ``"reflect"`` normalized (bool, optional): controls whether to return the normalized STFT results. Default: ``False`` onesided (bool, optional): controls whether to return half of the results to avoid redundancy for real inputs. Default: ``True`` for real :attr:`input` and :attr:`window`, ``False`` otherwise. return_complex (bool, optional): whether to return a complex tensor, or a real tensor with an extra last dimension for the real and imaginary components. Returns: Tensor: A tensor containing the STFT result with shape described above """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( stft, (input,), input, n_fft, hop_length=hop_length, win_length=win_length, window=window, center=center, pad_mode=pad_mode, normalized=normalized, onesided=onesided, return_complex=return_complex) # TODO: after having proper ways to map Python strings to ATen Enum, move # this and F.pad to ATen.
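# Shape sketch for the centering step below (a worked example, not from the
# upstream source): with n_fft=4 and center=True, a length-10 signal is
# padded by n_fft // 2 = 2 samples on each side (to length 14) before framing,
# so that frame t is centered at t * hop_length of the original signal.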
if center: signal_dim = input.dim() extended_shape = [1] * (3 - signal_dim) + list(input.size()) pad = int(n_fft // 2) input = F.pad(input.view(extended_shape), (pad, pad), pad_mode) input = input.view(input.shape[-signal_dim:]) # STFT patch for aarch64 # https://stackoverflow.com/a/66872148 librosa_stft = librosa.stft(input.cpu().detach().numpy().reshape(-1), n_fft, hop_length, win_length, window="hann", center=center, pad_mode=pad_mode) librosa_stft = np.array([[a.real, a.imag] for a in librosa_stft]) librosa_stft = np.transpose(librosa_stft, axes=[0, 2, 1]) librosa_stft = np.expand_dims(librosa_stft, 0) librosa_stft = torch.from_numpy(librosa_stft) return librosa_stft #return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore # normalized, onesided, return_complex) def istft(input: Tensor, n_fft: int, hop_length: Optional[int] = None, win_length: Optional[int] = None, window: Optional[Tensor] = None, center: bool = True, normalized: bool = False, onesided: Optional[bool] = None, length: Optional[int] = None, return_complex: bool = False) -> Tensor: r"""Inverse short time Fourier Transform. This is expected to be the inverse of :func:`~torch.stft`. It has the same parameters (+ additional optional parameter of :attr:`length`) and it should return the least squares estimation of the original signal. The algorithm will check whether the NOLA (nonzero overlap-add) condition is satisfied. It is important to choose the parameters :attr:`window` and :attr:`center` so that the envelope created by the summation of all the windows is never zero at any point in time. Specifically, :math:`\sum_{t=-\infty}^{\infty} |w|^2[n - t \times \text{hop\_length}] \neq 0`. Since :func:`~torch.stft` discards elements at the end of the signal if they do not fit in a frame, ``istft`` may return a shorter signal than the original signal (this can occur if :attr:`center` is ``False``, since the signal isn't padded). If :attr:`center` is ``True``, then there will be padding, e.g. ``'constant'``, ``'reflect'``, etc. The left padding can be trimmed off exactly because it can be calculated, but the right padding cannot be calculated without additional information. Example: Suppose the last window is ``[17, 18, 0, 0, 0]`` vs ``[18, 0, 0, 0, 0]``; :attr:`n_fft`, :attr:`hop_length`, and :attr:`win_length` are all the same in both cases, which prevents the calculation of the right padding. The missing values could be zeros or a reflection of the signal, so providing :attr:`length` can be useful. If :attr:`length` is ``None`` then padding will be aggressively removed (with some loss of signal). [1] D. W. Griffin and J. S. Lim, "Signal estimation from modified short-time Fourier transform," IEEE Trans. ASSP, vol. 32, no. 2, pp. 236-243, Apr. 1984. Arguments: input (Tensor): The input tensor. Expected to be output of :func:`~torch.stft`, can either be complex (``channel``, ``fft_size``, ``n_frame``), or real (``channel``, ``fft_size``, ``n_frame``, 2) where the ``channel`` dimension is optional. n_fft (int): Size of Fourier transform hop_length (Optional[int]): The distance between neighboring sliding window frames. (Default: ``n_fft // 4``) win_length (Optional[int]): The size of window frame and STFT filter. (Default: ``n_fft``) window (Optional[torch.Tensor]): The optional window function. (Default: ``torch.ones(win_length)``) center (bool): Whether :attr:`input` was padded on both sides so that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. (Default: ``True``) normalized (bool): Whether the STFT was normalized.
(Default: ``False``) onesided (Optional[bool]): Whether the STFT was onesided. (Default: ``True`` if ``n_fft != fft_size`` in the input size) length (Optional[int]): The amount to trim the signal by (i.e. the original signal length). (Default: whole signal) return_complex (Optional[bool]): Whether the output should be complex, or if the input should be assumed to derive from a real signal and window. Note that this is incompatible with ``onesided=True``. (Default: ``False``) Returns: Tensor: Least squares estimation of the original signal of size (..., signal_length) """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( istft, (input,), input, n_fft, hop_length=hop_length, win_length=win_length, window=window, center=center, normalized=normalized, onesided=onesided, length=length, return_complex=return_complex) return _VF.istft(input, n_fft, hop_length, win_length, window, center, # type: ignore normalized, onesided, length, return_complex) del torch.unique_dim if TYPE_CHECKING: # These _impl functions return a variable number of tensors as output with # __torch_function__; tuple unpacking is done already rather than being # done by the caller of the _impl function _unique_impl_out = Any else: _unique_impl_out = Tuple[Tensor, Tensor, Tensor] def _unique_impl(input: Tensor, sorted: bool = True, return_inverse: bool = False, return_counts: bool = False, dim: Optional[int] = None) -> _unique_impl_out: r"""Returns the unique elements of the input tensor. .. note:: This function is different from :func:`torch.unique_consecutive` in the sense that this function also eliminates non-consecutive duplicate values. .. note:: Currently in the CUDA implementation and the CPU implementation when dim is specified, `torch.unique` always sorts the tensor at the beginning regardless of the `sorted` argument. Sorting could be slow, so if your input tensor is already sorted, it is recommended to use :func:`torch.unique_consecutive` which avoids the sorting. Arguments: input (Tensor): the input tensor sorted (bool): Whether to sort the unique elements in ascending order before returning as output. return_inverse (bool): Whether to also return the indices for where elements in the original input ended up in the returned unique list. return_counts (bool): Whether to also return the counts for each unique element. dim (int): the dimension to apply unique. If ``None``, the unique of the flattened input is returned. default: ``None`` Returns: (Tensor, Tensor (optional), Tensor (optional)): A tensor or a tuple of tensors containing - **output** (*Tensor*): the output list of unique scalar elements. - **inverse_indices** (*Tensor*): (optional) if :attr:`return_inverse` is True, there will be an additional returned tensor (same shape as input) representing the indices for where elements in the original input map to in the output; otherwise, this function will only return a single tensor. - **counts** (*Tensor*): (optional) if :attr:`return_counts` is True, there will be an additional returned tensor (same shape as output or output.size(dim), if dim was specified) representing the number of occurrences for each unique value or tensor.
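A brief sketch of the ``dim`` path (a toy 2-D input; rows are deduplicated as whole units, and the result is sorted):

>>> x = torch.tensor([[3, 4], [1, 2], [1, 2]])
>>> torch.unique(x, dim=0)
tensor([[1, 2],
        [3, 4]])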
Example:: >>> output = torch.unique(torch.tensor([1, 3, 2, 3], dtype=torch.long)) >>> output tensor([ 1, 2, 3]) >>> output, inverse_indices = torch.unique( torch.tensor([1, 3, 2, 3], dtype=torch.long), sorted=True, return_inverse=True) >>> output tensor([ 1, 2, 3]) >>> inverse_indices tensor([ 0, 2, 1, 2]) >>> output, inverse_indices = torch.unique( torch.tensor([[1, 3], [2, 3]], dtype=torch.long), sorted=True, return_inverse=True) >>> output tensor([ 1, 2, 3]) >>> inverse_indices tensor([[ 0, 2], [ 1, 2]]) """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( unique, (input,), input, sorted=sorted, return_inverse=return_inverse, return_counts=return_counts, dim=dim) if dim is not None: output, inverse_indices, counts = _VF.unique_dim( # type: ignore input, dim, sorted=sorted, return_inverse=return_inverse, return_counts=return_counts, ) else: output, inverse_indices, counts = torch._unique2( input, sorted=sorted, return_inverse=return_inverse, return_counts=return_counts, ) return output, inverse_indices, counts def _unique_consecutive_impl(input: Tensor, return_inverse: bool = False, return_counts: bool = False, dim: Optional[int] = None) -> _unique_impl_out: r"""Eliminates all but the first element from every consecutive group of equivalent elements. .. note:: This function is different from :func:`torch.unique` in the sense that this function only eliminates consecutive duplicate values. The semantics are similar to `std::unique` in C++. Arguments: input (Tensor): the input tensor return_inverse (bool): Whether to also return the indices for where elements in the original input ended up in the returned unique list. return_counts (bool): Whether to also return the counts for each unique element. dim (int): the dimension to apply unique. If ``None``, the unique of the flattened input is returned. default: ``None`` Returns: (Tensor, Tensor (optional), Tensor (optional)): A tensor or a tuple of tensors containing - **output** (*Tensor*): the output list of unique scalar elements. - **inverse_indices** (*Tensor*): (optional) if :attr:`return_inverse` is True, there will be an additional returned tensor (same shape as input) representing the indices for where elements in the original input map to in the output; otherwise, this function will only return a single tensor. - **counts** (*Tensor*): (optional) if :attr:`return_counts` is True, there will be an additional returned tensor (same shape as output or output.size(dim), if dim was specified) representing the number of occurrences for each unique value or tensor.
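A small sketch combining ``dim`` with ``return_counts`` (toy input with consecutive duplicate rows):

>>> x = torch.tensor([[1, 1], [1, 1], [2, 2]])
>>> output, counts = torch.unique_consecutive(x, dim=0, return_counts=True)
>>> output
tensor([[1, 1],
        [2, 2]])
>>> counts
tensor([2, 1])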
Example:: >>> x = torch.tensor([1, 1, 2, 2, 3, 1, 1, 2]) >>> output = torch.unique_consecutive(x) >>> output tensor([1, 2, 3, 1, 2]) >>> output, inverse_indices = torch.unique_consecutive(x, return_inverse=True) >>> output tensor([1, 2, 3, 1, 2]) >>> inverse_indices tensor([0, 0, 1, 1, 2, 3, 3, 4]) >>> output, counts = torch.unique_consecutive(x, return_counts=True) >>> output tensor([1, 2, 3, 1, 2]) >>> counts tensor([2, 2, 1, 2, 1]) """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( unique_consecutive, (input,), input, return_inverse=return_inverse, return_counts=return_counts, dim=dim) output, inverse_indices, counts = _VF.unique_consecutive( # type: ignore input, return_inverse=return_inverse, return_counts=return_counts, dim=dim) return output, inverse_indices, counts def _return_counts(input, sorted=True, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor] if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_impl(input, sorted, return_inverse, return_counts, dim) output, _, counts = _unique_impl(input, sorted, return_inverse, return_counts, dim) return output, counts def _return_output(input, sorted=True, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, bool, Optional[int]) -> Tensor if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_impl(input, sorted, return_inverse, return_counts, dim) output, _, _ = _unique_impl(input, sorted, return_inverse, return_counts, dim) return output def _return_inverse(input, sorted=True, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor] if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_impl(input, sorted, return_inverse, return_counts, dim) output, inverse_indices, _ = _unique_impl(input, sorted, return_inverse, return_counts, dim) return output, inverse_indices _return_inverse_false = boolean_dispatch( arg_name='return_counts', arg_index=3, default=False, if_true=_return_counts, if_false=_return_output, module_name=__name__, func_name='unique') _return_inverse_true = boolean_dispatch( arg_name='return_counts', arg_index=3, default=False, if_true=_unique_impl, if_false=_return_inverse, module_name=__name__, func_name='unique') # The return type of unique depends on `return_inverse`, and `return_counts` so in order to # resolve the output type in TorchScript we need to statically know the value of both parameters unique = boolean_dispatch( arg_name='return_inverse', arg_index=2, default=False, if_true=_return_inverse_true, if_false=_return_inverse_false, module_name=__name__, func_name='unique') unique.__doc__ = _unique_impl.__doc__ def _consecutive_return_counts(input, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor] if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_consecutive_impl(input, return_inverse, return_counts, dim) output, _, counts = _unique_consecutive_impl(input, return_inverse, return_counts, dim) return output, counts def _consecutive_return_output(input, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, Optional[int]) -> Tensor if 
not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_consecutive_impl(input, return_inverse, return_counts, dim) output, _, _ = _unique_consecutive_impl(input, return_inverse, return_counts, dim) return output def _consecutive_return_inverse(input, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor] if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_consecutive_impl(input, return_inverse, return_counts, dim) output, inverse_indices, _ = _unique_consecutive_impl(input, return_inverse, return_counts, dim) return output, inverse_indices _consecutive_return_inverse_false = boolean_dispatch( arg_name='return_counts', arg_index=1, default=False, if_true=_consecutive_return_counts, if_false=_consecutive_return_output, module_name=__name__, func_name='unique_consecutive') _consecutive_return_inverse_true = boolean_dispatch( arg_name='return_counts', arg_index=1, default=False, if_true=_unique_consecutive_impl, if_false=_consecutive_return_inverse, module_name=__name__, func_name='unique_consecutive') # The return type of unique depends on `return_inverse`, and `return_counts` so in order to # resolve the output type in TorchScript we need to statically know the value of both parameters unique_consecutive = boolean_dispatch( arg_name='return_inverse', arg_index=2, default=False, if_true=_consecutive_return_inverse_true, if_false=_consecutive_return_inverse_false, module_name=__name__, func_name='unique_consecutive') unique_consecutive.__doc__ = _unique_consecutive_impl.__doc__ def tensordot(a, b, dims=2): r"""Returns a contraction of a and b over multiple dimensions. :attr:`tensordot` implements a generalized matrix product. Args: a (Tensor): Left tensor to contract b (Tensor): Right tensor to contract dims (int or tuple of two lists of integers): number of dimensions to contract or explicit lists of dimensions for :attr:`a` and :attr:`b` respectively When called with a non-negative integer argument :attr:`dims` = :math:`d`, and the number of dimensions of :attr:`a` and :attr:`b` is :math:`m` and :math:`n`, respectively, :func:`~torch.tensordot` computes .. math:: r_{i_0,...,i_{m-d}, i_d,...,i_n} = \sum_{k_0,...,k_{d-1}} a_{i_0,...,i_{m-d},k_0,...,k_{d-1}} \times b_{k_0,...,k_{d-1}, i_d,...,i_n}. When called with :attr:`dims` of the list form, the given dimensions will be contracted in place of the last :math:`d` of :attr:`a` and the first :math:`d` of :math:`b`. The sizes in these dimensions must match, but :func:`~torch.tensordot` will deal with broadcasted dimensions. 
Examples:: >>> a = torch.arange(60.).reshape(3, 4, 5) >>> b = torch.arange(24.).reshape(4, 3, 2) >>> torch.tensordot(a, b, dims=([1, 0], [0, 1])) tensor([[4400., 4730.], [4532., 4874.], [4664., 5018.], [4796., 5162.], [4928., 5306.]]) >>> a = torch.randn(3, 4, 5, device='cuda') >>> b = torch.randn(4, 5, 6, device='cuda') >>> c = torch.tensordot(a, b, dims=2).cpu() tensor([[ 8.3504, -2.5436, 6.2922, 2.7556, -1.0732, 3.2741], [ 3.3161, 0.0704, 5.0187, -0.4079, -4.3126, 4.8744], [ 0.8223, 3.9445, 3.2168, -0.2400, 3.4117, 1.7780]]) """ if not torch.jit.is_scripting(): if (type(a) is not Tensor or type(b) is not Tensor) and has_torch_function((a, b)): return handle_torch_function(tensordot, (a, b), a, b, dims=dims) if isinstance(dims, (list, tuple)) or \ (isinstance(dims, torch.Tensor) and dims.numel() > 1): dims_a, dims_b = dims else: if isinstance(dims, torch.Tensor): dims = dims.item() if dims < 0: raise RuntimeError(f"tensordot expects dims >= 0, but got dims={dims}") dims_a = list(range(-dims, 0)) dims_b = list(range(dims)) return _VF.tensordot(a, b, dims_a, dims_b) # type: ignore def cartesian_prod(*tensors): """Do cartesian product of the given sequence of tensors. The behavior is similar to python's `itertools.product`. Arguments: *tensors: any number of 1 dimensional tensors. Returns: Tensor: A tensor equivalent to converting all the input tensors into lists, do `itertools.product` on these lists, and finally convert the resulting list into tensor. Example:: >>> a = [1, 2, 3] >>> b = [4, 5] >>> list(itertools.product(a, b)) [(1, 4), (1, 5), (2, 4), (2, 5), (3, 4), (3, 5)] >>> tensor_a = torch.tensor(a) >>> tensor_b = torch.tensor(b) >>> torch.cartesian_prod(tensor_a, tensor_b) tensor([[1, 4], [1, 5], [2, 4], [2, 5], [3, 4], [3, 5]]) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(cartesian_prod, tensors, *tensors) return _VF.cartesian_prod(tensors) # type: ignore def block_diag(*tensors): """Create a block diagonal matrix from provided tensors. Arguments: *tensors: One or more tensors with 0, 1, or 2 dimensions. Returns: Tensor: A 2 dimensional tensor with all the input tensors arranged in order such that their upper left and lower right corners are diagonally adjacent. All other elements are set to 0. Example:: >>> import torch >>> A = torch.tensor([[0, 1], [1, 0]]) >>> B = torch.tensor([[3, 4, 5], [6, 7, 8]]) >>> C = torch.tensor(7) >>> D = torch.tensor([1, 2, 3]) >>> E = torch.tensor([[4], [5], [6]]) >>> torch.block_diag(A, B, C, D, E) tensor([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 3, 4, 5, 0, 0, 0, 0, 0], [0, 0, 6, 7, 8, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 7, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 2, 3, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 4], [0, 0, 0, 0, 0, 0, 0, 0, 0, 5], [0, 0, 0, 0, 0, 0, 0, 0, 0, 6]]) """ if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(block_diag, tensors, *tensors) return torch._C._VariableFunctions.block_diag(tensors) # type: ignore def cdist(x1, x2, p=2., compute_mode='use_mm_for_euclid_dist_if_necessary'): # type: (Tensor, Tensor, float, str) -> (Tensor) r"""Computes batched the p-norm distance between each pair of the two collections of row vectors. Args: x1 (Tensor): input tensor of shape :math:`B \times P \times M`. x2 (Tensor): input tensor of shape :math:`B \times R \times M`. p: p value for the p-norm distance to calculate between each vector pair :math:`\in [0, \infty]`. 
compute_mode: 'use_mm_for_euclid_dist_if_necessary' - will use matrix multiplication approach to calculate euclidean distance (p = 2) if P > 25 or R > 25 'use_mm_for_euclid_dist' - will always use matrix multiplication approach to calculate euclidean distance (p = 2) 'donot_use_mm_for_euclid_dist' - will never use matrix multiplication approach to calculate euclidean distance (p = 2) Default: use_mm_for_euclid_dist_if_necessary. If x1 has shape :math:`B \times P \times M` and x2 has shape :math:`B \times R \times M` then the output will have shape :math:`B \times P \times R`. This function is equivalent to `scipy.spatial.distance.cdist(input,'minkowski', p=p)` if :math:`p \in (0, \infty)`. When :math:`p = 0` it is equivalent to `scipy.spatial.distance.cdist(input, 'hamming') * M`. When :math:`p = \infty`, the closest scipy function is `scipy.spatial.distance.cdist(xn, lambda x, y: np.abs(x - y).max())`. Example: >>> a = torch.tensor([[0.9041, 0.0196], [-0.3108, -2.4423], [-0.4821, 1.059]]) >>> a tensor([[ 0.9041, 0.0196], [-0.3108, -2.4423], [-0.4821, 1.0590]]) >>> b = torch.tensor([[-2.1763, -0.4713], [-0.6986, 1.3702]]) >>> b tensor([[-2.1763, -0.4713], [-0.6986, 1.3702]]) >>> torch.cdist(a, b, p=2) tensor([[3.1193, 2.0959], [2.7138, 3.8322], [2.2830, 0.3791]]) """ if not torch.jit.is_scripting(): if (type(x1) is not Tensor or type(x2) is not Tensor) and has_torch_function((x1, x2)): return handle_torch_function( cdist, (x1, x2), x1, x2, p=p, compute_mode=compute_mode) if compute_mode == 'use_mm_for_euclid_dist_if_necessary': return _VF.cdist(x1, x2, p, None) # type: ignore elif compute_mode == 'use_mm_for_euclid_dist': return _VF.cdist(x1, x2, p, 1) # type: ignore elif compute_mode == 'donot_use_mm_for_euclid_dist': return _VF.cdist(x1, x2, p, 2) # type: ignore else: raise ValueError(f"{compute_mode} is not a valid value for compute_mode") def atleast_1d(*tensors): r""" Returns a 1-dimensional view of each input tensor with zero dimensions. Input tensors with one or more dimensions are returned as-is. Args: input (Tensor or list of Tensors) Returns: output (Tensor or tuple of Tensors) Example:: >>> x = torch.randn(2) >>> x tensor([1.4584, 0.7583]) >>> torch.atleast_1d(x) tensor([1.4584, 0.7583]) >>> x = torch.tensor(1.) >>> x tensor(1.) >>> torch.atleast_1d(x) tensor([1.]) >>> x = torch.tensor(0.5) >>> y = torch.tensor(1.) >>> torch.atleast_1d((x,y)) (tensor([0.5000]), tensor([1.])) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(atleast_1d, tensors, *tensors) if len(tensors) == 1: tensors = tensors[0] return _VF.atleast_1d(tensors) # type: ignore def atleast_2d(*tensors): r""" Returns a 2-dimensional view of each input tensor with zero dimensions. Input tensors with two or more dimensions are returned as-is. Args: input (Tensor or list of Tensors) Returns: output (Tensor or tuple of Tensors) Example:: >>> x = torch.tensor(1.) >>> x tensor(1.) >>> torch.atleast_2d(x) tensor([[1.]]) >>> x = torch.randn(2,2) >>> x tensor([[2.2086, 2.5165], [0.1757, 0.5194]]) >>> torch.atleast_2d(x) tensor([[2.2086, 2.5165], [0.1757, 0.5194]]) >>> x = torch.tensor(0.5) >>> y = torch.tensor(1.)
>>> torch.atleast_2d((x,y)) (tensor([[0.5000]]), tensor([[1.]])) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(atleast_2d, tensors, *tensors) if len(tensors) == 1: tensors = tensors[0] return _VF.atleast_2d(tensors) # type: ignore def atleast_3d(*tensors): r""" Returns a 3-dimensional view of each input tensor with zero dimensions. Input tensors with three or more dimensions are returned as-is. Args: input (Tensor or list of Tensors) Returns: output (Tensor or tuple of Tensors) Example: >>> x = torch.tensor(0.5) >>> x tensor(0.5000) >>> torch.atleast_3d(x) tensor([[[0.5000]]]) >>> y = torch.randn(2,2) >>> y tensor([[-0.8079, 0.7460], [-1.1647, 1.4734]]) >>> torch.atleast_3d(y) tensor([[[-0.8079], [ 0.7460]], [[-1.1647], [ 1.4734]]]) >>> x = torch.randn(1,1,1) >>> x tensor([[[-1.5689]]]) >>> torch.atleast_3d(x) tensor([[[-1.5689]]]) >>> x = torch.tensor(0.5) >>> y = torch.tensor(1.) >>> torch.atleast_3d((x,y)) (tensor([[[0.5000]]]), tensor([[[1.]]])) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(atleast_3d, tensors, *tensors) if len(tensors) == 1: tensors = tensors[0] return _VF.atleast_3d(tensors) # type: ignore if TYPE_CHECKING: pass # There's no good way to use this type annotation; cannot rename norm() to # _norm_impl() in a way that doesn't break JIT overloads. So leave untyped # for mypy for now. # def norm(input: Tensor, # p: Optional[Union[str, Number]] = "fro", # dim: Optional[Union[int, List[int]]] = None, # keepdim: bool = False, # out: Optional[Tensor] = None, # dtype: _dtype = None) -> Tensor: # return _norm_impl(input, p, dim, keepdim, out, dtype) else: # TODO: type dim as BroadcastingList when # https://github.com/pytorch/pytorch/issues/33782 is fixed @overload # noqa: 749 def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 # type: (Tensor, str, Optional[List[int]], bool, Optional[Tensor], Optional[int]) -> Tensor pass @overload # noqa: 749 def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 # type: (Tensor, Optional[number], Optional[List[int]], bool, Optional[Tensor], Optional[int]) -> Tensor pass @overload # noqa: 749 def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 # type: (Tensor, Optional[number], Optional[int], bool, Optional[Tensor], Optional[int]) -> Tensor pass @overload # noqa: 749 def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 # type: (Tensor, str, Optional[int], bool, Optional[Tensor], Optional[int]) -> Tensor pass def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 r"""Returns the matrix norm or vector norm of a given tensor. .. warning:: torch.norm is deprecated and may be removed in a future PyTorch release. Use :func:`torch.linalg.norm` instead, but note that :func:`torch.linalg.norm` has a different signature and slightly different behavior that is more consistent with NumPy's numpy.linalg.norm. Args: input (Tensor): the input tensor p (int, float, inf, -inf, 'fro', 'nuc', optional): the order of norm.
Default: ``'fro'`` The following norms can be calculated: ===== ============================ ========================== ord matrix norm vector norm ===== ============================ ========================== None Frobenius norm 2-norm 'fro' Frobenius norm -- 'nuc' nuclear norm -- Other as vec norm when dim is None sum(abs(x)**ord)**(1./ord) ===== ============================ ========================== dim (int, 2-tuple of ints, 2-list of ints, optional): If it is an int, vector norm will be calculated, if it is 2-tuple of ints, matrix norm will be calculated. If the value is None, matrix norm will be calculated when the input tensor only has two dimensions, vector norm will be calculated when the input tensor only has one dimension. If the input tensor has more than two dimensions, the vector norm will be applied to last dimension. keepdim (bool, optional): whether the output tensors have :attr:`dim` retained or not. Ignored if :attr:`dim` = ``None`` and :attr:`out` = ``None``. Default: ``False`` out (Tensor, optional): the output tensor. Ignored if :attr:`dim` = ``None`` and :attr:`out` = ``None``. dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. If specified, the input tensor is cast to :attr:`dtype` while performing the operation. Default: None. Example:: >>> import torch >>> a = torch.arange(9, dtype= torch.float) - 4 >>> b = a.reshape((3, 3)) >>> torch.norm(a) tensor(7.7460) >>> torch.norm(b) tensor(7.7460) >>> torch.norm(a, float('inf')) tensor(4.) >>> torch.norm(b, float('inf')) tensor(4.) >>> c = torch.tensor([[ 1, 2, 3],[-1, 1, 4]] , dtype= torch.float) >>> torch.norm(c, dim=0) tensor([1.4142, 2.2361, 5.0000]) >>> torch.norm(c, dim=1) tensor([3.7417, 4.2426]) >>> torch.norm(c, p=1, dim=1) tensor([6., 6.]) >>> d = torch.arange(8, dtype= torch.float).reshape(2,2,2) >>> torch.norm(d, dim=(1,2)) tensor([ 3.7417, 11.2250]) >>> torch.norm(d[0, :, :]), torch.norm(d[1, :, :]) (tensor(3.7417), tensor(11.2250)) """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( norm, (input,), input, p=p, dim=dim, keepdim=keepdim, out=out, dtype=dtype) ndim = input.dim() # catch default case if dim is None and out is None and dtype is None and p is not None: if isinstance(p, str): if p == "fro": return _VF.frobenius_norm(input, dim=(), keepdim=keepdim) # type: ignore if not isinstance(p, str): _dim = [i for i in range(ndim)] # noqa: C416 TODO: rewrite as list(range(m)) return _VF.norm(input, p, dim=_dim, keepdim=keepdim) # type: ignore # TODO: when https://github.com/pytorch/pytorch/issues/33782 is fixed # remove the overloads where dim is an int and replace with BroadcastingList1 # and remove next four lines, replace _dim with dim if dim is not None: if isinstance(dim, int): _dim = [dim] else: _dim = dim else: _dim = None # type: ignore if isinstance(p, str): if p == "fro": if dtype is not None: raise ValueError("dtype argument is not supported in frobenius norm") if _dim is None: _dim = [i for i in range(ndim)] # noqa: C416 TODO: rewrite as list(range(m)) if out is None: return _VF.frobenius_norm(input, _dim, keepdim=keepdim) # type: ignore else: return _VF.frobenius_norm(input, _dim, keepdim=keepdim, out=out) # type: ignore elif p == "nuc": if dtype is not None: raise ValueError("dtype argument is not supported in nuclear norm") if _dim is None: if out is None: return _VF.nuclear_norm(input, keepdim=keepdim) # type: ignore else: return _VF.nuclear_norm(input, keepdim=keepdim, out=out) #
type: ignore else: if out is None: return _VF.nuclear_norm(input, _dim, keepdim=keepdim) # type: ignore else: return _VF.nuclear_norm(input, _dim, keepdim=keepdim, out=out) # type: ignore raise RuntimeError(f"only valid string values are 'fro' and 'nuc', found {p}") else: if _dim is None: _dim = [i for i in range(ndim)] # noqa: C416 TODO: rewrite as list(range(m)) if out is None: if dtype is None: return _VF.norm(input, p, _dim, keepdim=keepdim) # type: ignore else: return _VF.norm(input, p, _dim, keepdim=keepdim, dtype=dtype) # type: ignore else: if dtype is None: return _VF.norm(input, p, _dim, keepdim=keepdim, out=out) # type: ignore else: return _VF.norm(input, p, _dim, keepdim=keepdim, dtype=dtype, out=out) # type: ignore def chain_matmul(*matrices): r"""Returns the matrix product of the :math:`N` 2-D tensors. This product is efficiently computed using the matrix chain order algorithm which selects the order that incurs the lowest cost in terms of arithmetic operations (`[CLRS]`_). Note that since this is a function to compute the product, :math:`N` needs to be greater than or equal to 2; if equal to 2 then a trivial matrix-matrix product is returned. If :math:`N` is 1, then this is a no-op - the original matrix is returned as is. Args: matrices (Tensors...): a sequence of 2 or more 2-D tensors whose product is to be determined. Returns: Tensor: if the :math:`i^{th}` tensor was of dimensions :math:`p_{i} \times p_{i + 1}`, then the product would be of dimensions :math:`p_{1} \times p_{N + 1}`. Example:: >>> a = torch.randn(3, 4) >>> b = torch.randn(4, 5) >>> c = torch.randn(5, 6) >>> d = torch.randn(6, 7) >>> torch.chain_matmul(a, b, c, d) tensor([[ -2.3375, -3.9790, -4.1119, -6.6577, 9.5609, -11.5095, -3.2614], [ 21.4038, 3.3378, -8.4982, -5.2457, -10.2561, -2.4684, 2.7163], [ -0.9647, -5.8917, -2.3213, -5.2284, 12.8615, -12.2816, -2.5095]]) .. _`[CLRS]`: https://mitpress.mit.edu/books/introduction-algorithms-third-edition """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in matrices) and has_torch_function(matrices): return handle_torch_function(chain_matmul, matrices, *matrices) return _VF.chain_matmul(matrices) # type: ignore def _lu_impl(A, pivot=True, get_infos=False, out=None): # type: (Tensor, bool, bool, Any) -> Tuple[Tensor, Tensor, Tensor] r"""Computes the LU factorization of a matrix or batches of matrices :attr:`A`. Returns a tuple containing the LU factorization and pivots of :attr:`A`. Pivoting is done if :attr:`pivot` is set to ``True``. .. note:: The pivots returned by the function are 1-indexed. If :attr:`pivot` is ``False``, then the returned pivots tensor is filled with zeros of the appropriate size. .. note:: LU factorization with :attr:`pivot` = ``False`` is not available for CPU, and attempting to do so will throw an error. However, LU factorization with :attr:`pivot` = ``False`` is available for CUDA. .. note:: This function does not check if the factorization was successful or not if :attr:`get_infos` is ``True`` since the status of the factorization is present in the third element of the return tuple. .. note:: In the case of batches of square matrices with size less than or equal to 32 on a CUDA device, the LU factorization is repeated for singular matrices due to the bug in the MAGMA library (see magma issue 13). .. note:: ``L``, ``U``, and ``P`` can be derived using :func:`torch.lu_unpack`. Arguments: A (Tensor): the tensor to factor of size :math:`(*, m, n)` pivot (bool, optional): controls whether pivoting is done.
Default: ``True`` get_infos (bool, optional): if set to ``True``, returns an info IntTensor. Default: ``False`` out (tuple, optional): optional output tuple. If :attr:`get_infos` is ``True``, then the elements in the tuple are Tensor, IntTensor, and IntTensor. If :attr:`get_infos` is ``False``, then the elements in the tuple are Tensor, IntTensor. Default: ``None`` Returns: (Tensor, IntTensor, IntTensor (optional)): A tuple of tensors containing - **factorization** (*Tensor*): the factorization of size :math:`(*, m, n)` - **pivots** (*IntTensor*): the pivots of size :math:`(*, m)` - **infos** (*IntTensor*, *optional*): if :attr:`get_infos` is ``True``, this is a tensor of size :math:`(*)` where non-zero values indicate whether factorization for the matrix or each minibatch has succeeded or failed Example:: >>> A = torch.randn(2, 3, 3) >>> A_LU, pivots = torch.lu(A) >>> A_LU tensor([[[ 1.3506, 2.5558, -0.0816], [ 0.1684, 1.1551, 0.1940], [ 0.1193, 0.6189, -0.5497]], [[ 0.4526, 1.2526, -0.3285], [-0.7988, 0.7175, -0.9701], [ 0.2634, -0.9255, -0.3459]]]) >>> pivots tensor([[ 3, 3, 3], [ 3, 3, 3]], dtype=torch.int32) >>> A_LU, pivots, info = torch.lu(A, get_infos=True) >>> if info.nonzero().size(0) == 0: ... print('LU factorization succeeded for all samples!') LU factorization succeeded for all samples! """ # If get_infos is True, then we don't need to check for errors and vice versa return torch._lu_with_info(A, pivot=pivot, check_errors=(not get_infos)) if TYPE_CHECKING: _ListOrSeq = Sequence[Tensor] else: _ListOrSeq = List[Tensor] def _check_list_size(out_len: int, get_infos: bool, out: _ListOrSeq) -> None: get_infos_int = 1 if get_infos else 0 if out_len - get_infos_int != 2: raise TypeError(f"expected tuple of {2 + int(get_infos)} elements but got {out_len}") if not isinstance(out, (tuple, list)): raise TypeError(f"argument 'out' must be tuple of Tensors, not {type(out).__name__}") def _lu_with_infos(A, pivot=True, get_infos=False, out=None): # type: (Tensor, bool, bool, Optional[Tuple[Tensor, Tensor, Tensor]]) -> Tuple[Tensor, Tensor, Tensor] if not torch.jit.is_scripting(): if type(A) is not Tensor and has_torch_function((A,)): return handle_torch_function( lu, (A,), A, pivot=pivot, get_infos=get_infos, out=out) result = _lu_impl(A, pivot, get_infos, out) if out is not None: _check_list_size(len(out), get_infos, out) for i in range(len(out)): out[i].resize_as_(result[i]).copy_(result[i]) return out else: return result # A_LU, pivots, infos def _lu_no_infos(A, pivot=True, get_infos=False, out=None): # type: (Tensor, bool, bool, Optional[Tuple[Tensor, Tensor]]) -> Tuple[Tensor, Tensor] # need to check for torch_function here so that we exit early if a # torch_function override should handle this call if not torch.jit.is_scripting(): if type(A) is not Tensor and has_torch_function((A,)): return handle_torch_function( lu, (A,), A, pivot=pivot, get_infos=get_infos, out=out) result = _lu_impl(A, pivot, get_infos, out) if out is not None: _check_list_size(len(out), get_infos, out) for i in range(len(out)): out[i].resize_as_(result[i]).copy_(result[i]) return out else: return result[0], result[1] # A_LU, pivots # The return type of lu depends on `get_infos`, so in order to resolve the output type # of lu in TorchScript we need to statically know the value of `get_infos` lu = boolean_dispatch( arg_name='get_infos', arg_index=2, default=False, if_true=_lu_with_infos, if_false=_lu_no_infos, module_name=__name__, func_name='lu') lu.__doc__ = _lu_impl.__doc__ def align_tensors(*tensors): raise RuntimeError('`align_tensors` not yet implemented.')
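Editor's aside on the boolean_dispatch pattern that functional.py uses for unique, unique_consecutive, and lu above: TorchScript must resolve a function's return arity statically, so the module keeps one concrete implementation per flag combination and wires them together with torch's boolean_dispatch helper at import time. Below is a minimal standalone sketch of the same idea in plain Python (simplified to a runtime check; all names are illustrative and not part of the patch):

def _values_only(xs):
    # single-return variant: just the sorted unique values
    return sorted(set(xs))

def _values_and_counts(xs):
    # two-return variant: the values plus a per-value count
    vals = sorted(set(xs))
    return vals, [xs.count(v) for v in vals]

def boolean_dispatch_demo(arg_name, default, if_true, if_false):
    # select the implementation from a boolean keyword argument, so every
    # call site with a literal flag has a fixed, statically known return type
    def dispatched(xs, **kwargs):
        return if_true(xs) if kwargs.get(arg_name, default) else if_false(xs)
    return dispatched

unique_list = boolean_dispatch_demo('return_counts', False, _values_and_counts, _values_only)

assert unique_list([1, 3, 2, 3]) == [1, 2, 3]
assert unique_list([1, 3, 2, 3], return_counts=True) == ([1, 2, 3], [1, 1, 2])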
================================================ FILE: patches/transformers/4.5.0/convert_graph_to_onnx.diff ================================================ 14a15,17 > import os > import json > 83a87,91 > "--save-config", > action="store_true", > help="Save the model configuration along with the ONNX", > ) > self.add_argument( 280a289,295 > print('Exporting from PyTorch to ONNX...') > print('input_names', input_names) > print('output_names', output_names) > print('dynamic_axes', dynamic_axes) > print('tokens', tokens) > print('model_args', model_args) > 291a307 > verbose=True 339a356 > save_config: bool = False, 366,367c383,384 < elif len(listdir(output.parent.as_posix())) > 0: < raise Exception(f"Folder {output.parent.as_posix()} is not empty, aborting conversion") --- > #elif len(listdir(output.parent.as_posix())) > 0: > # raise Exception(f"Folder {output.parent.as_posix()} is not empty, aborting conversion") 374c391,407 < --- > > # Save the configuration > if save_config: > config_path = os.path.splitext(output)[0] + '.json' > > config = dict( > model = nlp.model.config.to_dict(), > tokenizer = nlp.tokenizer.init_kwargs > ) > > #nlp.model.config.to_json_file(config_path) > > with open(config_path, 'w') as config_file: > json.dump(config, config_file, indent=2) > > print(f"Saved config to {config_path}") > 468a502 > args.save_config ================================================ FILE: patches/transformers/4.5.0/convert_graph_to_onnx.original.py ================================================ # Copyright 2020 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
from argparse import ArgumentParser from os import listdir, makedirs from pathlib import Path from typing import Dict, List, Optional, Tuple from packaging.version import Version, parse from transformers.file_utils import ModelOutput, is_tf_available, is_torch_available from transformers.pipelines import Pipeline, pipeline from transformers.tokenization_utils import BatchEncoding # This is the minimal required version to # support some ONNX Runtime features ORT_QUANTIZE_MINIMUM_VERSION = parse("1.4.0") SUPPORTED_PIPELINES = [ "feature-extraction", "ner", "sentiment-analysis", "fill-mask", "question-answering", "text-generation", "translation_en_to_fr", "translation_en_to_de", "translation_en_to_ro", ] class OnnxConverterArgumentParser(ArgumentParser): """ Wraps all the script arguments supported to export transformers models to ONNX IR """ def __init__(self): super().__init__("ONNX Converter") self.add_argument( "--pipeline", type=str, choices=SUPPORTED_PIPELINES, default="feature-extraction", ) self.add_argument( "--model", type=str, required=True, help="Model's id or path (ex: bert-base-cased)", ) self.add_argument("--tokenizer", type=str, help="Tokenizer's id or path (ex: bert-base-cased)") self.add_argument( "--framework", type=str, choices=["pt", "tf"], help="Framework for loading the model", ) self.add_argument("--opset", type=int, default=11, help="ONNX opset to use") self.add_argument( "--check-loading", action="store_true", help="Check ONNX is able to load the model", ) self.add_argument( "--use-external-format", action="store_true", help="Allow exporting model >= than 2Gb", ) self.add_argument( "--quantize", action="store_true", help="Quantize the neural network to be run with int8", ) self.add_argument("output") def generate_identified_filename(filename: Path, identifier: str) -> Path: """ Append a string-identifier at the end (before the extension, if any) to the provided filepath Args: filename: pathlib.Path The actual path object we would like to add an identifier suffix identifier: The suffix to add Returns: String with concatenated identifier at the end of the filename """ return filename.parent.joinpath(filename.stem + identifier).with_suffix(filename.suffix) def check_onnxruntime_requirements(minimum_version: Version): """ Check onnxruntime is installed and if the installed version match is recent enough Raises: ImportError: If onnxruntime is not installed or too old version is found """ try: import onnxruntime # Parse the version of the installed onnxruntime ort_version = parse(onnxruntime.__version__) # We require 1.4.0 minimum if ort_version < ORT_QUANTIZE_MINIMUM_VERSION: raise ImportError( f"We found an older version of onnxruntime ({onnxruntime.__version__}) " f"but we require onnxruntime to be >= {minimum_version} to enable all the conversions options.\n" f"Please update onnxruntime by running `pip install --upgrade onnxruntime`" ) except ImportError: raise ImportError( "onnxruntime doesn't seem to be currently installed. " "Please install the onnxruntime by running `pip install onnxruntime`" " and relaunch the conversion." 
) def ensure_valid_input(model, tokens, input_names): """ Ensure input are presented in the correct order, without any Non Args: model: The model used to forward the input data tokens: BatchEncoding holding the input data input_names: The name of the inputs Returns: Tuple """ print("Ensuring inputs are in correct order") model_args_name = model.forward.__code__.co_varnames model_args, ordered_input_names = [], [] for arg_name in model_args_name[1:]: # start at index 1 to skip "self" argument if arg_name in input_names: ordered_input_names.append(arg_name) model_args.append(tokens[arg_name]) else: print(f"{arg_name} is not present in the generated input list.") break print(f"Generated inputs order: {ordered_input_names}") return ordered_input_names, tuple(model_args) def infer_shapes(nlp: Pipeline, framework: str) -> Tuple[List[str], List[str], Dict, BatchEncoding]: """ Attempt to infer the static vs dynamic axes for each input and output tensors for a specific model Args: nlp: The pipeline object holding the model to be exported framework: The framework identifier to dispatch to the correct inference scheme (pt/tf) Returns: - List of the inferred input variable names - List of the inferred output variable names - Dictionary with input/output variables names as key and shape tensor as value - a BatchEncoding reference which was used to infer all the above information """ def build_shape_dict(name: str, tensor, is_input: bool, seq_len: int): if isinstance(tensor, (tuple, list)): return [build_shape_dict(name, t, is_input, seq_len) for t in tensor] else: # Let's assume batch is the first axis with only 1 element (~~ might not be always true ...) axes = {[axis for axis, numel in enumerate(tensor.shape) if numel == 1][0]: "batch"} if is_input: if len(tensor.shape) == 2: axes[1] = "sequence" else: raise ValueError(f"Unable to infer tensor axes ({len(tensor.shape)})") else: seq_axes = [dim for dim, shape in enumerate(tensor.shape) if shape == seq_len] axes.update({dim: "sequence" for dim in seq_axes}) print(f"Found {'input' if is_input else 'output'} {name} with shape: {axes}") return axes tokens = nlp.tokenizer("This is a sample output", return_tensors=framework) seq_len = tokens.input_ids.shape[-1] outputs = nlp.model(**tokens) if framework == "pt" else nlp.model(tokens) if isinstance(outputs, ModelOutput): outputs = outputs.to_tuple() if not isinstance(outputs, (list, tuple)): outputs = (outputs,) # Generate input names & axes input_vars = list(tokens.keys()) input_dynamic_axes = {k: build_shape_dict(k, v, True, seq_len) for k, v in tokens.items()} # flatten potentially grouped outputs (past for gpt2, attentions) outputs_flat = [] for output in outputs: if isinstance(output, (tuple, list)): outputs_flat.extend(output) else: outputs_flat.append(output) # Generate output names & axes output_names = [f"output_{i}" for i in range(len(outputs_flat))] output_dynamic_axes = {k: build_shape_dict(k, v, False, seq_len) for k, v in zip(output_names, outputs_flat)} # Create the aggregated axes representation dynamic_axes = dict(input_dynamic_axes, **output_dynamic_axes) return input_vars, output_names, dynamic_axes, tokens def load_graph_from_args( pipeline_name: str, framework: str, model: str, tokenizer: Optional[str] = None, **models_kwargs ) -> Pipeline: """ Convert the set of arguments provided through the CLI to an actual pipeline reference (tokenizer + model Args: pipeline_name: The kind of pipeline to use (ner, question-answering, etc.) 
framework: The actual model to convert the pipeline from ("pt" or "tf") model: The model name which will be loaded by the pipeline tokenizer: The tokenizer name which will be loaded by the pipeline, default to the model's value Returns: Pipeline object """ # If no tokenizer provided if tokenizer is None: tokenizer = model # Check the wanted framework is available if framework == "pt" and not is_torch_available(): raise Exception("Cannot convert because PyTorch is not installed. Please install torch first.") if framework == "tf" and not is_tf_available(): raise Exception("Cannot convert because TF is not installed. Please install tensorflow first.") print(f"Loading pipeline (model: {model}, tokenizer: {tokenizer})") # Allocate tokenizer and model return pipeline(pipeline_name, model=model, tokenizer=tokenizer, framework=framework, model_kwargs=models_kwargs) def convert_pytorch(nlp: Pipeline, opset: int, output: Path, use_external_format: bool): """ Export a PyTorch backed pipeline to ONNX Intermediate Representation (IR Args: nlp: The pipeline to be exported opset: The actual version of the ONNX operator set to use output: Path where will be stored the generated ONNX model use_external_format: Split the model definition from its parameters to allow model bigger than 2GB Returns: """ if not is_torch_available(): raise Exception("Cannot convert because PyTorch is not installed. Please install torch first.") import torch from torch.onnx import export print(f"Using framework PyTorch: {torch.__version__}") with torch.no_grad(): input_names, output_names, dynamic_axes, tokens = infer_shapes(nlp, "pt") ordered_input_names, model_args = ensure_valid_input(nlp.model, tokens, input_names) export( nlp.model, model_args, f=output.as_posix(), input_names=ordered_input_names, output_names=output_names, dynamic_axes=dynamic_axes, do_constant_folding=True, use_external_data_format=use_external_format, enable_onnx_checker=True, opset_version=opset, ) def convert_tensorflow(nlp: Pipeline, opset: int, output: Path): """ Export a TensorFlow backed pipeline to ONNX Intermediate Representation (IR Args: nlp: The pipeline to be exported opset: The actual version of the ONNX operator set to use output: Path where will be stored the generated ONNX model Notes: TensorFlow cannot export model bigger than 2GB due to internal constraint from TensorFlow """ if not is_tf_available(): raise Exception("Cannot convert because TF is not installed. Please install tensorflow first.") print("/!\\ Please note TensorFlow doesn't support exporting model > 2Gb /!\\") try: import tensorflow as tf from keras2onnx import __version__ as k2ov from keras2onnx import convert_keras, save_model print(f"Using framework TensorFlow: {tf.version.VERSION}, keras2onnx: {k2ov}") # Build input_names, output_names, dynamic_axes, tokens = infer_shapes(nlp, "tf") # Forward nlp.model.predict(tokens.data) onnx_model = convert_keras(nlp.model, nlp.model.name, target_opset=opset) save_model(onnx_model, output.as_posix()) except ImportError as e: raise Exception(f"Cannot import {e.name} required to convert TF model to ONNX. 
Please install {e.name} first.") def convert( framework: str, model: str, output: Path, opset: int, tokenizer: Optional[str] = None, use_external_format: bool = False, pipeline_name: str = "feature-extraction", **model_kwargs ): """ Convert the pipeline object to the ONNX Intermediate Representation (IR) format Args: framework: The framework the pipeline is backed by ("pt" or "tf") model: The name of the model to load for the pipeline output: The path where the ONNX graph will be stored opset: The actual version of the ONNX operator set to use tokenizer: The name of the model to load for the pipeline, default to the model's name if not provided use_external_format: Split the model definition from its parameters to allow model bigger than 2GB (PyTorch only) pipeline_name: The kind of pipeline to instantiate (ner, question-answering, etc.) model_kwargs: Keyword arguments to be forwarded to the model constructor Returns: """ print(f"ONNX opset version set to: {opset}") # Load the pipeline nlp = load_graph_from_args(pipeline_name, framework, model, tokenizer, **model_kwargs) if not output.parent.exists(): print(f"Creating folder {output.parent}") makedirs(output.parent.as_posix()) elif len(listdir(output.parent.as_posix())) > 0: raise Exception(f"Folder {output.parent.as_posix()} is not empty, aborting conversion") # Export the graph if framework == "pt": convert_pytorch(nlp, opset, output, use_external_format) else: convert_tensorflow(nlp, opset, output) def optimize(onnx_model_path: Path) -> Path: """ Load the model at the specified path and let onnxruntime look at transformations on the graph to enable all the optimizations possibl Args: onnx_model_path: filepath where the model binary description is stored Returns: Path where the optimized model binary description has been saved """ from onnxruntime import InferenceSession, SessionOptions # Generate model name with suffix "optimized" opt_model_path = generate_identified_filename(onnx_model_path, "-optimized") sess_option = SessionOptions() sess_option.optimized_model_filepath = opt_model_path.as_posix() _ = InferenceSession(onnx_model_path.as_posix(), sess_option) print(f"Optimized model has been written at {opt_model_path}: \N{heavy check mark}") print("/!\\ Optimized model contains hardware specific operators which might not be portable. /!\\") return opt_model_path def quantize(onnx_model_path: Path) -> Path: """ Quantize the weights of the model from float32 to in8 to allow very efficient inference on modern CPU Args: onnx_model_path: Path to location the exported ONNX model is stored Returns: The Path generated for the quantized """ import onnx from onnxruntime.quantization import QuantizationMode, quantize onnx_model = onnx.load(onnx_model_path.as_posix()) # Discussed with @yufenglee from ONNX runtime, this will be address in the next release of onnxruntime print( "As of onnxruntime 1.4.0, models larger than 2GB will fail to quantize due to protobuf constraint.\n" "This limitation will be removed in the next release of onnxruntime." 
) quantized_model = quantize( model=onnx_model, quantization_mode=QuantizationMode.IntegerOps, force_fusions=True, symmetric_weight=True, ) # Append "-quantized" at the end of the model's name quantized_model_path = generate_identified_filename(onnx_model_path, "-quantized") # Save model print(f"Quantized model has been written at {quantized_model_path}: \N{heavy check mark}") onnx.save_model(quantized_model, quantized_model_path.as_posix()) return quantized_model_path def verify(path: Path): from onnxruntime import InferenceSession, SessionOptions from onnxruntime.capi.onnxruntime_pybind11_state import RuntimeException print(f"Checking ONNX model loading from: {path} ...") try: onnx_options = SessionOptions() _ = InferenceSession(path.as_posix(), onnx_options, providers=["CPUExecutionProvider"]) print(f"Model {path} correctly loaded: \N{heavy check mark}") except RuntimeException as re: print(f"Error while loading the model {re}: \N{heavy ballot x}") if __name__ == "__main__": parser = OnnxConverterArgumentParser() args = parser.parse_args() # Make sure output is absolute path args.output = Path(args.output).absolute() try: print("\n====== Converting model to ONNX ======") # Convert convert( args.framework, args.model, args.output, args.opset, args.tokenizer, args.use_external_format, args.pipeline, ) if args.quantize: # Ensure requirements for quantization on onnxruntime is met check_onnxruntime_requirements(ORT_QUANTIZE_MINIMUM_VERSION) # onnxruntime optimizations doesn't provide the same level of performances on TensorFlow than PyTorch if args.framework == "tf": print( "\t Using TensorFlow might not provide the same optimization level compared to PyTorch.\n" "\t For TensorFlow users you can try optimizing the model directly through onnxruntime_tools.\n" "\t For more information, please refer to the onnxruntime documentation:\n" "\t\thttps://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers\n" ) print("\n====== Optimizing ONNX model ======") # Quantization works best when using the optimized version of the model args.optimized_output = optimize(args.output) # Do the quantization on the right graph args.quantized_output = quantize(args.optimized_output) # And verify if args.check_loading: print("\n====== Check exported ONNX model(s) ======") verify(args.output) if hasattr(args, "optimized_output"): verify(args.optimized_output) if hasattr(args, "quantized_output"): verify(args.quantized_output) except Exception as e: print(f"Error while converting the model: {e}") exit(1) ================================================ FILE: patches/transformers/4.5.0/convert_graph_to_onnx.py ================================================ # Copyright 2020 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
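# Editor's note (illustrative, not part of the upstream file): this is the
# patched copy; the hunks in convert_graph_to_onnx.diff above add the
# --save-config flag and verbose export logging to it. Assuming transformers
# 4.5.0 with this file swapped in, an export might be invoked like this
# (the model name and output path are placeholders, not taken from the repo):
#
#   python -m transformers.convert_graph_to_onnx --framework pt \
#       --model distilbert-base-cased-distilled-squad \
#       --pipeline question-answering --save-config qa/distilbert.onnx
#
# With --save-config set, convert() below also writes the model and tokenizer
# configuration to os.path.splitext(output)[0] + '.json' (qa/distilbert.json
# in this example) next to the exported graph.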
import os import json from argparse import ArgumentParser from os import listdir, makedirs from pathlib import Path from typing import Dict, List, Optional, Tuple from packaging.version import Version, parse from transformers.file_utils import ModelOutput, is_tf_available, is_torch_available from transformers.pipelines import Pipeline, pipeline from transformers.tokenization_utils import BatchEncoding # This is the minimal required version to # support some ONNX Runtime features ORT_QUANTIZE_MINIMUM_VERSION = parse("1.4.0") SUPPORTED_PIPELINES = [ "feature-extraction", "ner", "sentiment-analysis", "fill-mask", "question-answering", "text-generation", "translation_en_to_fr", "translation_en_to_de", "translation_en_to_ro", ] class OnnxConverterArgumentParser(ArgumentParser): """ Wraps all the script arguments supported to export transformers models to ONNX IR """ def __init__(self): super().__init__("ONNX Converter") self.add_argument( "--pipeline", type=str, choices=SUPPORTED_PIPELINES, default="feature-extraction", ) self.add_argument( "--model", type=str, required=True, help="Model's id or path (ex: bert-base-cased)", ) self.add_argument("--tokenizer", type=str, help="Tokenizer's id or path (ex: bert-base-cased)") self.add_argument( "--framework", type=str, choices=["pt", "tf"], help="Framework for loading the model", ) self.add_argument("--opset", type=int, default=11, help="ONNX opset to use") self.add_argument( "--check-loading", action="store_true", help="Check ONNX is able to load the model", ) self.add_argument( "--use-external-format", action="store_true", help="Allow exporting model >= than 2Gb", ) self.add_argument( "--save-config", action="store_true", help="Save the model configuration along with the ONNX", ) self.add_argument( "--quantize", action="store_true", help="Quantize the neural network to be run with int8", ) self.add_argument("output") def generate_identified_filename(filename: Path, identifier: str) -> Path: """ Append a string-identifier at the end (before the extension, if any) to the provided filepath Args: filename: pathlib.Path The actual path object we would like to add an identifier suffix identifier: The suffix to add Returns: String with concatenated identifier at the end of the filename """ return filename.parent.joinpath(filename.stem + identifier).with_suffix(filename.suffix) def check_onnxruntime_requirements(minimum_version: Version): """ Check onnxruntime is installed and if the installed version match is recent enough Raises: ImportError: If onnxruntime is not installed or too old version is found """ try: import onnxruntime # Parse the version of the installed onnxruntime ort_version = parse(onnxruntime.__version__) # We require 1.4.0 minimum if ort_version < ORT_QUANTIZE_MINIMUM_VERSION: raise ImportError( f"We found an older version of onnxruntime ({onnxruntime.__version__}) " f"but we require onnxruntime to be >= {minimum_version} to enable all the conversions options.\n" f"Please update onnxruntime by running `pip install --upgrade onnxruntime`" ) except ImportError: raise ImportError( "onnxruntime doesn't seem to be currently installed. " "Please install the onnxruntime by running `pip install onnxruntime`" " and relaunch the conversion." 
) def ensure_valid_input(model, tokens, input_names): """ Ensure inputs are presented in the correct order, without any None Args: model: The model used to forward the input data tokens: BatchEncoding holding the input data input_names: The names of the inputs Returns: Tuple """ print("Ensuring inputs are in correct order") model_args_name = model.forward.__code__.co_varnames model_args, ordered_input_names = [], [] for arg_name in model_args_name[1:]: # start at index 1 to skip "self" argument if arg_name in input_names: ordered_input_names.append(arg_name) model_args.append(tokens[arg_name]) else: print(f"{arg_name} is not present in the generated input list.") break print(f"Generated inputs order: {ordered_input_names}") return ordered_input_names, tuple(model_args) def infer_shapes(nlp: Pipeline, framework: str) -> Tuple[List[str], List[str], Dict, BatchEncoding]: """ Attempt to infer the static vs dynamic axes for each input and output tensor of a specific model Args: nlp: The pipeline object holding the model to be exported framework: The framework identifier to dispatch to the correct inference scheme (pt/tf) Returns: - List of the inferred input variable names - List of the inferred output variable names - Dictionary with input/output variable names as keys and shape tensors as values - a BatchEncoding reference which was used to infer all the above information """ def build_shape_dict(name: str, tensor, is_input: bool, seq_len: int): if isinstance(tensor, (tuple, list)): return [build_shape_dict(name, t, is_input, seq_len) for t in tensor] else: # Let's assume batch is the first axis with only 1 element (~~ might not be always true ...) axes = {[axis for axis, numel in enumerate(tensor.shape) if numel == 1][0]: "batch"} if is_input: if len(tensor.shape) == 2: axes[1] = "sequence" else: raise ValueError(f"Unable to infer tensor axes ({len(tensor.shape)})") else: seq_axes = [dim for dim, shape in enumerate(tensor.shape) if shape == seq_len] axes.update({dim: "sequence" for dim in seq_axes}) print(f"Found {'input' if is_input else 'output'} {name} with shape: {axes}") return axes tokens = nlp.tokenizer("This is a sample output", return_tensors=framework) seq_len = tokens.input_ids.shape[-1] outputs = nlp.model(**tokens) if framework == "pt" else nlp.model(tokens) if isinstance(outputs, ModelOutput): outputs = outputs.to_tuple() if not isinstance(outputs, (list, tuple)): outputs = (outputs,) # Generate input names & axes input_vars = list(tokens.keys()) input_dynamic_axes = {k: build_shape_dict(k, v, True, seq_len) for k, v in tokens.items()} # flatten potentially grouped outputs (past for gpt2, attentions) outputs_flat = [] for output in outputs: if isinstance(output, (tuple, list)): outputs_flat.extend(output) else: outputs_flat.append(output) # Generate output names & axes output_names = [f"output_{i}" for i in range(len(outputs_flat))] output_dynamic_axes = {k: build_shape_dict(k, v, False, seq_len) for k, v in zip(output_names, outputs_flat)} # Create the aggregated axes representation dynamic_axes = dict(input_dynamic_axes, **output_dynamic_axes) return input_vars, output_names, dynamic_axes, tokens def load_graph_from_args( pipeline_name: str, framework: str, model: str, tokenizer: Optional[str] = None, **models_kwargs ) -> Pipeline: """ Convert the set of arguments provided through the CLI to an actual pipeline reference (tokenizer + model) Args: pipeline_name: The kind of pipeline to use (ner, question-answering, etc.)
framework: The framework to convert the pipeline from ("pt" or "tf") model: The model name which will be loaded by the pipeline tokenizer: The tokenizer name which will be loaded by the pipeline, defaults to the model's value Returns: Pipeline object """ # If no tokenizer provided if tokenizer is None: tokenizer = model # Check the wanted framework is available if framework == "pt" and not is_torch_available(): raise Exception("Cannot convert because PyTorch is not installed. Please install torch first.") if framework == "tf" and not is_tf_available(): raise Exception("Cannot convert because TF is not installed. Please install tensorflow first.") print(f"Loading pipeline (model: {model}, tokenizer: {tokenizer})") # Allocate tokenizer and model return pipeline(pipeline_name, model=model, tokenizer=tokenizer, framework=framework, model_kwargs=models_kwargs) def convert_pytorch(nlp: Pipeline, opset: int, output: Path, use_external_format: bool): """ Export a PyTorch backed pipeline to ONNX Intermediate Representation (IR) Args: nlp: The pipeline to be exported opset: The actual version of the ONNX operator set to use output: Path where the generated ONNX model will be stored use_external_format: Split the model definition from its parameters to allow models bigger than 2GB Returns: """ if not is_torch_available(): raise Exception("Cannot convert because PyTorch is not installed. Please install torch first.") import torch from torch.onnx import export print(f"Using framework PyTorch: {torch.__version__}") with torch.no_grad(): input_names, output_names, dynamic_axes, tokens = infer_shapes(nlp, "pt") ordered_input_names, model_args = ensure_valid_input(nlp.model, tokens, input_names) print('Exporting from PyTorch to ONNX...') print('input_names', input_names) print('output_names', output_names) print('dynamic_axes', dynamic_axes) print('tokens', tokens) print('model_args', model_args) export( nlp.model, model_args, f=output.as_posix(), input_names=ordered_input_names, output_names=output_names, dynamic_axes=dynamic_axes, do_constant_folding=True, use_external_data_format=use_external_format, enable_onnx_checker=True, opset_version=opset, verbose=True ) def convert_tensorflow(nlp: Pipeline, opset: int, output: Path): """ Export a TensorFlow backed pipeline to ONNX Intermediate Representation (IR) Args: nlp: The pipeline to be exported opset: The actual version of the ONNX operator set to use output: Path where the generated ONNX model will be stored Notes: TensorFlow cannot export models bigger than 2GB due to an internal constraint in TensorFlow """ if not is_tf_available(): raise Exception("Cannot convert because TF is not installed. Please install tensorflow first.") print("/!\\ Please note TensorFlow doesn't support exporting model > 2Gb /!\\") try: import tensorflow as tf from keras2onnx import __version__ as k2ov from keras2onnx import convert_keras, save_model print(f"Using framework TensorFlow: {tf.version.VERSION}, keras2onnx: {k2ov}") # Build input_names, output_names, dynamic_axes, tokens = infer_shapes(nlp, "tf") # Forward nlp.model.predict(tokens.data) onnx_model = convert_keras(nlp.model, nlp.model.name, target_opset=opset) save_model(onnx_model, output.as_posix()) except ImportError as e: raise Exception(f"Cannot import {e.name} required to convert TF model to ONNX.
Please install {e.name} first.") def convert( framework: str, model: str, output: Path, opset: int, tokenizer: Optional[str] = None, use_external_format: bool = False, pipeline_name: str = "feature-extraction", save_config: bool = False, **model_kwargs ): """ Convert the pipeline object to the ONNX Intermediate Representation (IR) format Args: framework: The framework the pipeline is backed by ("pt" or "tf") model: The name of the model to load for the pipeline output: The path where the ONNX graph will be stored opset: The actual version of the ONNX operator set to use tokenizer: The name of the tokenizer to load for the pipeline, defaults to the model's name if not provided use_external_format: Split the model definition from its parameters to allow models bigger than 2GB (PyTorch only) pipeline_name: The kind of pipeline to instantiate (ner, question-answering, etc.) save_config: If True, also write the model and tokenizer configuration to a .json file next to the ONNX graph model_kwargs: Keyword arguments to be forwarded to the model constructor Returns: """ print(f"ONNX opset version set to: {opset}") # Load the pipeline nlp = load_graph_from_args(pipeline_name, framework, model, tokenizer, **model_kwargs) if not output.parent.exists(): print(f"Creating folder {output.parent}") makedirs(output.parent.as_posix()) #elif len(listdir(output.parent.as_posix())) > 0: # raise Exception(f"Folder {output.parent.as_posix()} is not empty, aborting conversion") # Export the graph if framework == "pt": convert_pytorch(nlp, opset, output, use_external_format) else: convert_tensorflow(nlp, opset, output) # Save the configuration if save_config: config_path = os.path.splitext(output)[0] + '.json' config = dict( model = nlp.model.config.to_dict(), tokenizer = nlp.tokenizer.init_kwargs ) #nlp.model.config.to_json_file(config_path) with open(config_path, 'w') as config_file: json.dump(config, config_file, indent=2) print(f"Saved config to {config_path}") def optimize(onnx_model_path: Path) -> Path: """ Load the model at the specified path and let onnxruntime look at transformations on the graph to enable all the optimizations possible Args: onnx_model_path: filepath where the model binary description is stored Returns: Path where the optimized model binary description has been saved """ from onnxruntime import InferenceSession, SessionOptions # Generate model name with suffix "optimized" opt_model_path = generate_identified_filename(onnx_model_path, "-optimized") sess_option = SessionOptions() sess_option.optimized_model_filepath = opt_model_path.as_posix() _ = InferenceSession(onnx_model_path.as_posix(), sess_option) print(f"Optimized model has been written at {opt_model_path}: \N{heavy check mark}") print("/!\\ Optimized model contains hardware specific operators which might not be portable. /!\\") return opt_model_path def quantize(onnx_model_path: Path) -> Path: """ Quantize the weights of the model from float32 to int8 to allow very efficient inference on modern CPU Args: onnx_model_path: Path to the location where the exported ONNX model is stored Returns: The Path generated for the quantized model """ import onnx from onnxruntime.quantization import QuantizationMode, quantize onnx_model = onnx.load(onnx_model_path.as_posix()) # Discussed with @yufenglee from ONNX runtime, this will be addressed in the next release of onnxruntime print( "As of onnxruntime 1.4.0, models larger than 2GB will fail to quantize due to protobuf constraint.\n" "This limitation will be removed in the next release of onnxruntime."
) quantized_model = quantize( model=onnx_model, quantization_mode=QuantizationMode.IntegerOps, force_fusions=True, symmetric_weight=True, ) # Append "-quantized" at the end of the model's name quantized_model_path = generate_identified_filename(onnx_model_path, "-quantized") # Save model print(f"Quantized model has been written at {quantized_model_path}: \N{heavy check mark}") onnx.save_model(quantized_model, quantized_model_path.as_posix()) return quantized_model_path def verify(path: Path): from onnxruntime import InferenceSession, SessionOptions from onnxruntime.capi.onnxruntime_pybind11_state import RuntimeException print(f"Checking ONNX model loading from: {path} ...") try: onnx_options = SessionOptions() _ = InferenceSession(path.as_posix(), onnx_options, providers=["CPUExecutionProvider"]) print(f"Model {path} correctly loaded: \N{heavy check mark}") except RuntimeException as re: print(f"Error while loading the model {re}: \N{heavy ballot x}") if __name__ == "__main__": parser = OnnxConverterArgumentParser() args = parser.parse_args() # Make sure output is absolute path args.output = Path(args.output).absolute() try: print("\n====== Converting model to ONNX ======") # Convert convert( args.framework, args.model, args.output, args.opset, args.tokenizer, args.use_external_format, args.pipeline, args.save_config ) if args.quantize: # Ensure requirements for quantization on onnxruntime is met check_onnxruntime_requirements(ORT_QUANTIZE_MINIMUM_VERSION) # onnxruntime optimizations doesn't provide the same level of performances on TensorFlow than PyTorch if args.framework == "tf": print( "\t Using TensorFlow might not provide the same optimization level compared to PyTorch.\n" "\t For TensorFlow users you can try optimizing the model directly through onnxruntime_tools.\n" "\t For more information, please refer to the onnxruntime documentation:\n" "\t\thttps://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers\n" ) print("\n====== Optimizing ONNX model ======") # Quantization works best when using the optimized version of the model args.optimized_output = optimize(args.output) # Do the quantization on the right graph args.quantized_output = quantize(args.optimized_output) # And verify if args.check_loading: print("\n====== Check exported ONNX model(s) ======") verify(args.output) if hasattr(args, "optimized_output"): verify(args.optimized_output) if hasattr(args, "quantized_output"): verify(args.quantized_output) except Exception as e: print(f"Error while converting the model: {e}") exit(1) ================================================ FILE: patches/transformers/4.5.0/modeling_distilbert.py ================================================ # coding=utf-8 # Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
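# Editor's note (descriptive only, not part of the upstream file): the
# create_sinusoidal_embeddings() helper further below fills position pos and
# channel j of the embedding table with
#     sin(pos / 10000**(2 * (j // 2) / dim))   for even j
#     cos(pos / 10000**(2 * (j // 2) / dim))   for odd j
# i.e. the fixed positional encoding from "Attention Is All You Need". It is
# only applied when config.sinusoidal_pos_embds is set; otherwise the learned
# nn.Embedding position weights are used as-is.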
""" PyTorch DistilBERT model adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM) and in part from HuggingFace PyTorch version of Google AI Bert model (https://github.com/google-research/bert) """ import copy import math import numpy as np import torch import torch.nn as nn from torch.nn import CrossEntropyLoss from ...activations import gelu from ...file_utils import ( add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings, ) from ...modeling_outputs import ( BaseModelOutput, MaskedLMOutput, MultipleChoiceModelOutput, QuestionAnsweringModelOutput, SequenceClassifierOutput, TokenClassifierOutput, ) from ...modeling_utils import ( PreTrainedModel, apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer, ) from ...utils import logging from .configuration_distilbert import DistilBertConfig logger = logging.get_logger(__name__) _CHECKPOINT_FOR_DOC = "distilbert-base-uncased" _CONFIG_FOR_DOC = "DistilBertConfig" _TOKENIZER_FOR_DOC = "DistilBertTokenizer" DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "distilbert-base-uncased", "distilbert-base-uncased-distilled-squad", "distilbert-base-cased", "distilbert-base-cased-distilled-squad", "distilbert-base-german-cased", "distilbert-base-multilingual-cased", "distilbert-base-uncased-finetuned-sst-2-english", # See all DistilBERT models at https://huggingface.co/models?filter=distilbert ] # UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE # def create_sinusoidal_embeddings(n_pos, dim, out): position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]) out.requires_grad = False out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2])) out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2])) out.detach_() class Embeddings(nn.Module): def __init__(self, config): super().__init__() self.word_embeddings = nn.Embedding(config.vocab_size, config.dim, padding_idx=config.pad_token_id) self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.dim) if config.sinusoidal_pos_embds: create_sinusoidal_embeddings( n_pos=config.max_position_embeddings, dim=config.dim, out=self.position_embeddings.weight ) self.LayerNorm = nn.LayerNorm(config.dim, eps=1e-12) self.dropout = nn.Dropout(config.dropout) def forward(self, input_ids): """ Parameters: input_ids: torch.tensor(bs, max_seq_length) The token ids to embed. 
Returns: torch.tensor(bs, max_seq_length, dim) The embedded tokens (plus position embeddings, no token_type embeddings) """ seq_length = input_ids.size(1) position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) # (max_seq_length) position_ids = position_ids.unsqueeze(0).expand_as(input_ids) # (bs, max_seq_length) word_embeddings = self.word_embeddings(input_ids) # (bs, max_seq_length, dim) position_embeddings = self.position_embeddings(position_ids) # (bs, max_seq_length, dim) embeddings = word_embeddings + position_embeddings # (bs, max_seq_length, dim) embeddings = self.LayerNorm(embeddings) # (bs, max_seq_length, dim) embeddings = self.dropout(embeddings) # (bs, max_seq_length, dim) return embeddings class MultiHeadSelfAttention(nn.Module): def __init__(self, config): super().__init__() self.n_heads = config.n_heads self.dim = config.dim self.dropout = nn.Dropout(p=config.attention_dropout) assert self.dim % self.n_heads == 0 self.q_lin = nn.Linear(in_features=config.dim, out_features=config.dim) self.k_lin = nn.Linear(in_features=config.dim, out_features=config.dim) self.v_lin = nn.Linear(in_features=config.dim, out_features=config.dim) self.out_lin = nn.Linear(in_features=config.dim, out_features=config.dim) self.pruned_heads = set() def prune_heads(self, heads): attention_head_size = self.dim // self.n_heads if len(heads) == 0: return heads, index = find_pruneable_heads_and_indices(heads, self.n_heads, attention_head_size, self.pruned_heads) # Prune linear layers self.q_lin = prune_linear_layer(self.q_lin, index) self.k_lin = prune_linear_layer(self.k_lin, index) self.v_lin = prune_linear_layer(self.v_lin, index) self.out_lin = prune_linear_layer(self.out_lin, index, dim=1) # Update hyper params self.n_heads = self.n_heads - len(heads) self.dim = attention_head_size * self.n_heads self.pruned_heads = self.pruned_heads.union(heads) def forward(self, query, key, value, mask, head_mask=None, output_attentions=False): """ Parameters: query: torch.tensor(bs, seq_length, dim) key: torch.tensor(bs, seq_length, dim) value: torch.tensor(bs, seq_length, dim) mask: torch.tensor(bs, seq_length) Returns: weights: torch.tensor(bs, n_heads, seq_length, seq_length) Attention weights context: torch.tensor(bs, seq_length, dim) Contextualized layer. 
Optional: only if `output_attentions=True` """ bs, q_length, dim = query.size() k_length = key.size(1) # assert dim == self.dim, f'Dimensions do not match: {dim} input vs {self.dim} configured' # assert key.size() == value.size() dim_per_head = self.dim // self.n_heads mask_reshp = (bs, 1, 1, k_length) def shape(x): """ separate heads """ return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2) def unshape(x): """ group heads """ return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head) q = shape(self.q_lin(query)) # (bs, n_heads, q_length, dim_per_head) k = shape(self.k_lin(key)) # (bs, n_heads, k_length, dim_per_head) v = shape(self.v_lin(value)) # (bs, n_heads, k_length, dim_per_head) q = q / math.sqrt(dim_per_head) # (bs, n_heads, q_length, dim_per_head) scores = torch.matmul(q, k.transpose(2, 3)) # (bs, n_heads, q_length, k_length) mask = mask.view(mask_reshp).expand_as(scores) # (bs, n_heads, q_length, k_length) scores.masked_fill_((mask == 0), -float("inf")) # (bs, n_heads, q_length, k_length) weights = nn.Softmax(dim=-1)(scores) # (bs, n_heads, q_length, k_length) weights = self.dropout(weights) # (bs, n_heads, q_length, k_length) # Mask heads if we want to if head_mask is not None: weights = weights * head_mask context = torch.matmul(weights, v) # (bs, n_heads, q_length, dim_per_head) context = unshape(context) # (bs, q_length, dim) context = self.out_lin(context) # (bs, q_length, dim) if output_attentions: return (context, weights) else: return (context,) class FFN(nn.Module): def __init__(self, config): super().__init__() self.dropout = nn.Dropout(p=config.dropout) self.chunk_size_feed_forward = config.chunk_size_feed_forward self.seq_len_dim = 1 self.lin1 = nn.Linear(in_features=config.dim, out_features=config.hidden_dim) self.lin2 = nn.Linear(in_features=config.hidden_dim, out_features=config.dim) assert config.activation in ["relu", "gelu"], f"activation ({config.activation}) must be in ['relu', 'gelu']" self.activation = gelu if config.activation == "gelu" else nn.ReLU() def forward(self, input): return apply_chunking_to_forward(self.ff_chunk, self.chunk_size_feed_forward, self.seq_len_dim, input) def ff_chunk(self, input): x = self.lin1(input) x = self.activation(x) x = self.lin2(x) x = self.dropout(x) return x class TransformerBlock(nn.Module): def __init__(self, config): super().__init__() assert config.dim % config.n_heads == 0 self.attention = MultiHeadSelfAttention(config) self.sa_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12) self.ffn = FFN(config) self.output_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12) def forward(self, x, attn_mask=None, head_mask=None, output_attentions=False): """ Parameters: x: torch.tensor(bs, seq_length, dim) attn_mask: torch.tensor(bs, seq_length) Returns: sa_weights: torch.tensor(bs, n_heads, seq_length, seq_length) The attention weights ffn_output: torch.tensor(bs, seq_length, dim) The output of the transformer block contextualization. 
""" # Self-Attention sa_output = self.attention( query=x, key=x, value=x, mask=attn_mask, head_mask=head_mask, output_attentions=output_attentions, ) if output_attentions: sa_output, sa_weights = sa_output # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length) else: # To handle these `output_attentions` or `output_hidden_states` cases returning tuples assert type(sa_output) == tuple sa_output = sa_output[0] sa_output = self.sa_layer_norm(sa_output + x) # (bs, seq_length, dim) # Feed Forward Network ffn_output = self.ffn(sa_output) # (bs, seq_length, dim) ffn_output = self.output_layer_norm(ffn_output + sa_output) # (bs, seq_length, dim) output = (ffn_output,) if output_attentions: output = (sa_weights,) + output return output class Transformer(nn.Module): def __init__(self, config): super().__init__() self.n_layers = config.n_layers layer = TransformerBlock(config) self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.n_layers)]) def forward( self, x, attn_mask=None, head_mask=None, output_attentions=False, output_hidden_states=False, return_dict=None ): # docstyle-ignore """ Parameters: x: torch.tensor(bs, seq_length, dim) Input sequence embedded. attn_mask: torch.tensor(bs, seq_length) Attention mask on the sequence. Returns: hidden_state: torch.tensor(bs, seq_length, dim) Sequence of hidden states in the last (top) layer all_hidden_states: Tuple[torch.tensor(bs, seq_length, dim)] Tuple of length n_layers with the hidden states from each layer. Optional: only if output_hidden_states=True all_attentions: Tuple[torch.tensor(bs, n_heads, seq_length, seq_length)] Tuple of length n_layers with the attention weights from each layer Optional: only if output_attentions=True """ all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None hidden_state = x for i, layer_module in enumerate(self.layer): if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_state,) layer_outputs = layer_module( x=hidden_state, attn_mask=attn_mask, head_mask=head_mask[i], output_attentions=output_attentions ) hidden_state = layer_outputs[-1] if output_attentions: assert len(layer_outputs) == 2 attentions = layer_outputs[0] all_attentions = all_attentions + (attentions,) else: assert len(layer_outputs) == 1 # Add last layer if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_state,) if not return_dict: return tuple(v for v in [hidden_state, all_hidden_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_state, hidden_states=all_hidden_states, attentions=all_attentions ) # INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL # class DistilBertPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. 
""" config_class = DistilBertConfig load_tf_weights = None base_model_prefix = "distilbert" def _init_weights(self, module): """Initialize the weights.""" if isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if module.bias is not None: module.bias.data.zero_() elif isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) DISTILBERT_START_DOCSTRING = r""" This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) This model is also a PyTorch `torch.nn.Module `__ subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers.DistilBertConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ DISTILBERT_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`~transformers.DistilBertTokenizer`. See :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. `What are attention masks? <../glossary.html#attention-mask>`__ head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: - 1 indicates the head is **not masked**, - 0 indicates the head is **masked**. inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert :obj:`input_ids` indices into associated vectors than the model's internal embedding lookup matrix. output_attentions (:obj:`bool`, `optional`): Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned tensors for more detail. output_hidden_states (:obj:`bool`, `optional`): Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for more detail. return_dict (:obj:`bool`, `optional`): Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. 
""" @add_start_docstrings( "The bare DistilBERT encoder/transformer outputting raw hidden-states without any specific head on top.", DISTILBERT_START_DOCSTRING, ) class DistilBertModel(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.embeddings = Embeddings(config) # Embeddings self.transformer = Transformer(config) # Encoder self.init_weights() def get_input_embeddings(self): return self.embeddings.word_embeddings def set_input_embeddings(self, new_embeddings): self.embeddings.word_embeddings = new_embeddings def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ for layer, heads in heads_to_prune.items(): self.transformer.layer[layer].attention.prune_heads(heads) @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC, ) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = input_ids.size() elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") device = input_ids.device if input_ids is not None else inputs_embeds.device if attention_mask is None: attention_mask = torch.ones(input_shape, device=device) # (bs, seq_length) # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) if inputs_embeds is None: inputs_embeds = self.embeddings(input_ids) # (bs, seq_length, dim) return self.transformer( x=inputs_embeds, attn_mask=attention_mask, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) @add_start_docstrings( """DistilBert Model with a `masked language modeling` head on top. 
""", DISTILBERT_START_DOCSTRING, ) class DistilBertForMaskedLM(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.distilbert = DistilBertModel(config) self.vocab_transform = nn.Linear(config.dim, config.dim) self.vocab_layer_norm = nn.LayerNorm(config.dim, eps=1e-12) self.vocab_projector = nn.Linear(config.dim, config.vocab_size) self.init_weights() self.mlm_loss_fct = nn.CrossEntropyLoss() def get_output_embeddings(self): return self.vocab_projector def set_output_embeddings(self, new_embeddings): self.vocab_projector = new_embeddings @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC, ) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``. """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict dlbrt_output = self.distilbert( input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_states = dlbrt_output[0] # (bs, seq_length, dim) prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim) prediction_logits = gelu(prediction_logits) # (bs, seq_length, dim) prediction_logits = self.vocab_layer_norm(prediction_logits) # (bs, seq_length, dim) prediction_logits = self.vocab_projector(prediction_logits) # (bs, seq_length, vocab_size) mlm_loss = None if labels is not None: mlm_loss = self.mlm_loss_fct(prediction_logits.view(-1, prediction_logits.size(-1)), labels.view(-1)) if not return_dict: output = (prediction_logits,) + dlbrt_output[1:] return ((mlm_loss,) + output) if mlm_loss is not None else output return MaskedLMOutput( loss=mlm_loss, logits=prediction_logits, hidden_states=dlbrt_output.hidden_states, attentions=dlbrt_output.attentions, ) @add_start_docstrings( """ DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. 
""", DISTILBERT_START_DOCSTRING, ) class DistilBertForSequenceClassification(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.distilbert = DistilBertModel(config) self.pre_classifier = nn.Linear(config.dim, config.dim) self.classifier = nn.Linear(config.dim, config.num_labels) self.dropout = nn.Dropout(config.seq_classif_dropout) self.init_weights() @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, ) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict distilbert_output = self.distilbert( input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_state = distilbert_output[0] # (bs, seq_len, dim) pooled_output = hidden_state[:, 0] # (bs, dim) pooled_output = self.pre_classifier(pooled_output) # (bs, dim) pooled_output = nn.ReLU()(pooled_output) # (bs, dim) pooled_output = self.dropout(pooled_output) # (bs, dim) logits = self.classifier(pooled_output) # (bs, num_labels) loss = None if labels is not None: if self.num_labels == 1: loss_fct = nn.MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = nn.CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) if not return_dict: output = (logits,) + distilbert_output[1:] return ((loss,) + output) if loss is not None else output return SequenceClassifierOutput( loss=loss, logits=logits, hidden_states=distilbert_output.hidden_states, attentions=distilbert_output.attentions, ) @add_start_docstrings( """ DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
""", DISTILBERT_START_DOCSTRING, ) class DistilBertForQuestionAnswering(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.distilbert = DistilBertModel(config) self.qa_outputs = nn.Linear(config.dim, config.num_labels) assert config.num_labels == 2 self.dropout = nn.Dropout(config.qa_dropout) self.init_weights() @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=QuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC, ) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict distilbert_output = self.distilbert( input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_states = distilbert_output[0] # (bs, max_query_len, dim) hidden_states = self.dropout(hidden_states) # (bs, max_query_len, dim) logits = self.qa_outputs(hidden_states) # (bs, max_query_len, 2) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1) # (bs, max_query_len) end_logits = end_logits.squeeze(-1) # (bs, max_query_len) total_loss = None if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: start_positions = start_positions.squeeze(-1) if len(end_positions.size()) > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms ignored_index = start_logits.size(1) start_positions.clamp_(0, ignored_index) end_positions.clamp_(0, ignored_index) loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index) start_loss = loss_fct(start_logits, start_positions) end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 if not return_dict: output = (start_logits, end_logits) + distilbert_output[1:] return ((total_loss,) + output) if total_loss is not None else output return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, end_logits=end_logits, hidden_states=distilbert_output.hidden_states, attentions=distilbert_output.attentions, ) @add_start_docstrings( """ DistilBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 
""", DISTILBERT_START_DOCSTRING, ) class DistilBertForTokenClassification(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.distilbert = DistilBertModel(config) self.dropout = nn.Dropout(config.dropout) self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC, ) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.distilbert( input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) loss = None if labels is not None: loss_fct = CrossEntropyLoss() # Only keep active parts of the loss if attention_mask is not None: active_loss = attention_mask.view(-1) == 1 active_logits = logits.view(-1, self.num_labels) active_labels = torch.where( active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) ) loss = loss_fct(active_logits, active_labels) else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) if not return_dict: output = (logits,) + outputs[1:] return ((loss,) + output) if loss is not None else output return TokenClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) @add_start_docstrings( """ DistilBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, DISTILBERT_START_DOCSTRING, ) class DistilBertForMultipleChoice(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.distilbert = DistilBertModel(config) self.pre_classifier = nn.Linear(config.dim, config.dim) self.classifier = nn.Linear(config.dim, 1) self.dropout = nn.Dropout(config.seq_classif_dropout) self.init_weights() @add_start_docstrings_to_model_forward( DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") ) @replace_return_docstrings(output_type=MultipleChoiceModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. 
(See :obj:`input_ids` above) Returns: Examples:: >>> from transformers import DistilBertTokenizer, DistilBertForMultipleChoice >>> import torch >>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') >>> model = DistilBertForMultipleChoice.from_pretrained('distilbert-base-cased') >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." >>> choice0 = "It is eaten with a fork and a knife." >>> choice1 = "It is eaten while held in the hand." >>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1 >>> encoding = tokenizer([[prompt, choice0], [prompt, choice1]], return_tensors='pt', padding=True) >>> outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1 >>> # the linear classifier still needs to be trained >>> loss = outputs.loss >>> logits = outputs.logits """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None inputs_embeds = ( inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) if inputs_embeds is not None else None ) outputs = self.distilbert( input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_state = outputs[0] # (bs * num_choices, seq_len, dim) pooled_output = hidden_state[:, 0] # (bs * num_choices, dim) pooled_output = self.pre_classifier(pooled_output) # (bs * num_choices, dim) pooled_output = nn.ReLU()(pooled_output) # (bs * num_choices, dim) pooled_output = self.dropout(pooled_output) # (bs * num_choices, dim) logits = self.classifier(pooled_output) # (bs * num_choices, 1) reshaped_logits = logits.view(-1, num_choices) # (bs, num_choices) loss = None if labels is not None: loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) if not return_dict: output = (reshaped_logits,) + outputs[1:] return ((loss,) + output) if loss is not None else output return MultipleChoiceModelOutput( loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) ================================================ FILE: patches/transformers/4.5.1/convert_graph_to_onnx.diff ================================================ 14a15,17 > import os > import json > 83a87,91 > "--save-config", > action="store_true", > help="Save the model configuration along with the ONNX", > ) > self.add_argument( 280a289,295 > print('Exporting from PyTorch to ONNX...') > print('input_names', input_names) > print('output_names', output_names) > print('dynamic_axes', dynamic_axes) > print('tokens', tokens) > print('model_args', model_args) > 291a307 > verbose=True 339a356 > save_config: bool = False, 366,367c383,384 < elif len(listdir(output.parent.as_posix())) > 0: < raise Exception(f"Folder {output.parent.as_posix()} is not empty, aborting conversion") --- > #elif len(listdir(output.parent.as_posix())) > 0: > # raise Exception(f"Folder {output.parent.as_posix()} is not empty, aborting conversion") 374c391,407 < --- > > # Save the configuration > if save_config: > config_path = os.path.splitext(output)[0] + '.json' > > config = dict( > 
model = nlp.model.config.to_dict(), > tokenizer = nlp.tokenizer.init_kwargs > ) > > #nlp.model.config.to_json_file(config_path) > > with open(config_path, 'w') as config_file: > json.dump(config, config_file, indent=2) > > print(f"Saved config to {config_path}") > 468a502 > args.save_config ================================================ FILE: patches/transformers/4.5.1/convert_graph_to_onnx.original.py ================================================ # Copyright 2020 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from argparse import ArgumentParser from os import listdir, makedirs from pathlib import Path from typing import Dict, List, Optional, Tuple from packaging.version import Version, parse from transformers.file_utils import ModelOutput, is_tf_available, is_torch_available from transformers.pipelines import Pipeline, pipeline from transformers.tokenization_utils import BatchEncoding # This is the minimal required version to # support some ONNX Runtime features ORT_QUANTIZE_MINIMUM_VERSION = parse("1.4.0") SUPPORTED_PIPELINES = [ "feature-extraction", "ner", "sentiment-analysis", "fill-mask", "question-answering", "text-generation", "translation_en_to_fr", "translation_en_to_de", "translation_en_to_ro", ] class OnnxConverterArgumentParser(ArgumentParser): """ Wraps all the script arguments supported to export transformers models to ONNX IR """ def __init__(self): super().__init__("ONNX Converter") self.add_argument( "--pipeline", type=str, choices=SUPPORTED_PIPELINES, default="feature-extraction", ) self.add_argument( "--model", type=str, required=True, help="Model's id or path (ex: bert-base-cased)", ) self.add_argument("--tokenizer", type=str, help="Tokenizer's id or path (ex: bert-base-cased)") self.add_argument( "--framework", type=str, choices=["pt", "tf"], help="Framework for loading the model", ) self.add_argument("--opset", type=int, default=11, help="ONNX opset to use") self.add_argument( "--check-loading", action="store_true", help="Check ONNX is able to load the model", ) self.add_argument( "--use-external-format", action="store_true", help="Allow exporting model >= than 2Gb", ) self.add_argument( "--quantize", action="store_true", help="Quantize the neural network to be run with int8", ) self.add_argument("output") def generate_identified_filename(filename: Path, identifier: str) -> Path: """ Append a string-identifier at the end (before the extension, if any) to the provided filepath Args: filename: pathlib.Path The actual path object we would like to add an identifier suffix identifier: The suffix to add Returns: String with concatenated identifier at the end of the filename """ return filename.parent.joinpath(filename.stem + identifier).with_suffix(filename.suffix) def check_onnxruntime_requirements(minimum_version: Version): """ Check onnxruntime is installed and if the installed version match is recent enough Raises: ImportError: If onnxruntime is not installed or too old version is found """ try: import onnxruntime # Parse the version of 
the installed onnxruntime ort_version = parse(onnxruntime.__version__) # We require 1.4.0 minimum if ort_version < ORT_QUANTIZE_MINIMUM_VERSION: raise ImportError( f"We found an older version of onnxruntime ({onnxruntime.__version__}) " f"but we require onnxruntime to be >= {minimum_version} to enable all the conversions options.\n" f"Please update onnxruntime by running `pip install --upgrade onnxruntime`" ) except ImportError: raise ImportError( "onnxruntime doesn't seem to be currently installed. " "Please install the onnxruntime by running `pip install onnxruntime`" " and relaunch the conversion." ) def ensure_valid_input(model, tokens, input_names): """ Ensure input are presented in the correct order, without any Non Args: model: The model used to forward the input data tokens: BatchEncoding holding the input data input_names: The name of the inputs Returns: Tuple """ print("Ensuring inputs are in correct order") model_args_name = model.forward.__code__.co_varnames model_args, ordered_input_names = [], [] for arg_name in model_args_name[1:]: # start at index 1 to skip "self" argument if arg_name in input_names: ordered_input_names.append(arg_name) model_args.append(tokens[arg_name]) else: print(f"{arg_name} is not present in the generated input list.") break print(f"Generated inputs order: {ordered_input_names}") return ordered_input_names, tuple(model_args) def infer_shapes(nlp: Pipeline, framework: str) -> Tuple[List[str], List[str], Dict, BatchEncoding]: """ Attempt to infer the static vs dynamic axes for each input and output tensors for a specific model Args: nlp: The pipeline object holding the model to be exported framework: The framework identifier to dispatch to the correct inference scheme (pt/tf) Returns: - List of the inferred input variable names - List of the inferred output variable names - Dictionary with input/output variables names as key and shape tensor as value - a BatchEncoding reference which was used to infer all the above information """ def build_shape_dict(name: str, tensor, is_input: bool, seq_len: int): if isinstance(tensor, (tuple, list)): return [build_shape_dict(name, t, is_input, seq_len) for t in tensor] else: # Let's assume batch is the first axis with only 1 element (~~ might not be always true ...) 
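# For example (illustrative values, not produced by this script): a tokenized
# input of shape (1, 7) with seq_len == 7 makes the comprehension below choose
# axis 0 as "batch" (the only axis holding a single element), and the is_input
# branch then marks axis 1 as "sequence", so the tensor's dynamic_axes entry
# becomes {0: "batch", 1: "sequence"}:
#
#   >>> import torch
#   >>> tensor = torch.zeros(1, 7)
#   >>> [axis for axis, numel in enumerate(tensor.shape) if numel == 1][0]
#   0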
axes = {[axis for axis, numel in enumerate(tensor.shape) if numel == 1][0]: "batch"} if is_input: if len(tensor.shape) == 2: axes[1] = "sequence" else: raise ValueError(f"Unable to infer tensor axes ({len(tensor.shape)})") else: seq_axes = [dim for dim, shape in enumerate(tensor.shape) if shape == seq_len] axes.update({dim: "sequence" for dim in seq_axes}) print(f"Found {'input' if is_input else 'output'} {name} with shape: {axes}") return axes tokens = nlp.tokenizer("This is a sample output", return_tensors=framework) seq_len = tokens.input_ids.shape[-1] outputs = nlp.model(**tokens) if framework == "pt" else nlp.model(tokens) if isinstance(outputs, ModelOutput): outputs = outputs.to_tuple() if not isinstance(outputs, (list, tuple)): outputs = (outputs,) # Generate input names & axes input_vars = list(tokens.keys()) input_dynamic_axes = {k: build_shape_dict(k, v, True, seq_len) for k, v in tokens.items()} # flatten potentially grouped outputs (past for gpt2, attentions) outputs_flat = [] for output in outputs: if isinstance(output, (tuple, list)): outputs_flat.extend(output) else: outputs_flat.append(output) # Generate output names & axes output_names = [f"output_{i}" for i in range(len(outputs_flat))] output_dynamic_axes = {k: build_shape_dict(k, v, False, seq_len) for k, v in zip(output_names, outputs_flat)} # Create the aggregated axes representation dynamic_axes = dict(input_dynamic_axes, **output_dynamic_axes) return input_vars, output_names, dynamic_axes, tokens def load_graph_from_args( pipeline_name: str, framework: str, model: str, tokenizer: Optional[str] = None, **models_kwargs ) -> Pipeline: """ Convert the set of arguments provided through the CLI to an actual pipeline reference (tokenizer + model Args: pipeline_name: The kind of pipeline to use (ner, question-answering, etc.) framework: The actual model to convert the pipeline from ("pt" or "tf") model: The model name which will be loaded by the pipeline tokenizer: The tokenizer name which will be loaded by the pipeline, default to the model's value Returns: Pipeline object """ # If no tokenizer provided if tokenizer is None: tokenizer = model # Check the wanted framework is available if framework == "pt" and not is_torch_available(): raise Exception("Cannot convert because PyTorch is not installed. Please install torch first.") if framework == "tf" and not is_tf_available(): raise Exception("Cannot convert because TF is not installed. Please install tensorflow first.") print(f"Loading pipeline (model: {model}, tokenizer: {tokenizer})") # Allocate tokenizer and model return pipeline(pipeline_name, model=model, tokenizer=tokenizer, framework=framework, model_kwargs=models_kwargs) def convert_pytorch(nlp: Pipeline, opset: int, output: Path, use_external_format: bool): """ Export a PyTorch backed pipeline to ONNX Intermediate Representation (IR Args: nlp: The pipeline to be exported opset: The actual version of the ONNX operator set to use output: Path where will be stored the generated ONNX model use_external_format: Split the model definition from its parameters to allow model bigger than 2GB Returns: """ if not is_torch_available(): raise Exception("Cannot convert because PyTorch is not installed. 
Please install torch first.") import torch from torch.onnx import export print(f"Using framework PyTorch: {torch.__version__}") with torch.no_grad(): input_names, output_names, dynamic_axes, tokens = infer_shapes(nlp, "pt") ordered_input_names, model_args = ensure_valid_input(nlp.model, tokens, input_names) export( nlp.model, model_args, f=output.as_posix(), input_names=ordered_input_names, output_names=output_names, dynamic_axes=dynamic_axes, do_constant_folding=True, use_external_data_format=use_external_format, enable_onnx_checker=True, opset_version=opset, ) def convert_tensorflow(nlp: Pipeline, opset: int, output: Path): """ Export a TensorFlow backed pipeline to ONNX Intermediate Representation (IR Args: nlp: The pipeline to be exported opset: The actual version of the ONNX operator set to use output: Path where will be stored the generated ONNX model Notes: TensorFlow cannot export model bigger than 2GB due to internal constraint from TensorFlow """ if not is_tf_available(): raise Exception("Cannot convert because TF is not installed. Please install tensorflow first.") print("/!\\ Please note TensorFlow doesn't support exporting model > 2Gb /!\\") try: import tensorflow as tf from keras2onnx import __version__ as k2ov from keras2onnx import convert_keras, save_model print(f"Using framework TensorFlow: {tf.version.VERSION}, keras2onnx: {k2ov}") # Build input_names, output_names, dynamic_axes, tokens = infer_shapes(nlp, "tf") # Forward nlp.model.predict(tokens.data) onnx_model = convert_keras(nlp.model, nlp.model.name, target_opset=opset) save_model(onnx_model, output.as_posix()) except ImportError as e: raise Exception(f"Cannot import {e.name} required to convert TF model to ONNX. Please install {e.name} first.") def convert( framework: str, model: str, output: Path, opset: int, tokenizer: Optional[str] = None, use_external_format: bool = False, pipeline_name: str = "feature-extraction", **model_kwargs ): """ Convert the pipeline object to the ONNX Intermediate Representation (IR) format Args: framework: The framework the pipeline is backed by ("pt" or "tf") model: The name of the model to load for the pipeline output: The path where the ONNX graph will be stored opset: The actual version of the ONNX operator set to use tokenizer: The name of the model to load for the pipeline, default to the model's name if not provided use_external_format: Split the model definition from its parameters to allow model bigger than 2GB (PyTorch only) pipeline_name: The kind of pipeline to instantiate (ner, question-answering, etc.) 
model_kwargs: Keyword arguments to be forwarded to the model constructor Returns: """ print(f"ONNX opset version set to: {opset}") # Load the pipeline nlp = load_graph_from_args(pipeline_name, framework, model, tokenizer, **model_kwargs) if not output.parent.exists(): print(f"Creating folder {output.parent}") makedirs(output.parent.as_posix()) elif len(listdir(output.parent.as_posix())) > 0: raise Exception(f"Folder {output.parent.as_posix()} is not empty, aborting conversion") # Export the graph if framework == "pt": convert_pytorch(nlp, opset, output, use_external_format) else: convert_tensorflow(nlp, opset, output) def optimize(onnx_model_path: Path) -> Path: """ Load the model at the specified path and let onnxruntime look at transformations on the graph to enable all the optimizations possibl Args: onnx_model_path: filepath where the model binary description is stored Returns: Path where the optimized model binary description has been saved """ from onnxruntime import InferenceSession, SessionOptions # Generate model name with suffix "optimized" opt_model_path = generate_identified_filename(onnx_model_path, "-optimized") sess_option = SessionOptions() sess_option.optimized_model_filepath = opt_model_path.as_posix() _ = InferenceSession(onnx_model_path.as_posix(), sess_option) print(f"Optimized model has been written at {opt_model_path}: \N{heavy check mark}") print("/!\\ Optimized model contains hardware specific operators which might not be portable. /!\\") return opt_model_path def quantize(onnx_model_path: Path) -> Path: """ Quantize the weights of the model from float32 to in8 to allow very efficient inference on modern CPU Args: onnx_model_path: Path to location the exported ONNX model is stored Returns: The Path generated for the quantized """ import onnx from onnxruntime.quantization import QuantizationMode, quantize onnx_model = onnx.load(onnx_model_path.as_posix()) # Discussed with @yufenglee from ONNX runtime, this will be address in the next release of onnxruntime print( "As of onnxruntime 1.4.0, models larger than 2GB will fail to quantize due to protobuf constraint.\n" "This limitation will be removed in the next release of onnxruntime." 
) quantized_model = quantize( model=onnx_model, quantization_mode=QuantizationMode.IntegerOps, force_fusions=True, symmetric_weight=True, ) # Append "-quantized" at the end of the model's name quantized_model_path = generate_identified_filename(onnx_model_path, "-quantized") # Save model print(f"Quantized model has been written at {quantized_model_path}: \N{heavy check mark}") onnx.save_model(quantized_model, quantized_model_path.as_posix()) return quantized_model_path def verify(path: Path): from onnxruntime import InferenceSession, SessionOptions from onnxruntime.capi.onnxruntime_pybind11_state import RuntimeException print(f"Checking ONNX model loading from: {path} ...") try: onnx_options = SessionOptions() _ = InferenceSession(path.as_posix(), onnx_options, providers=["CPUExecutionProvider"]) print(f"Model {path} correctly loaded: \N{heavy check mark}") except RuntimeException as re: print(f"Error while loading the model {re}: \N{heavy ballot x}") if __name__ == "__main__": parser = OnnxConverterArgumentParser() args = parser.parse_args() # Make sure output is absolute path args.output = Path(args.output).absolute() try: print("\n====== Converting model to ONNX ======") # Convert convert( args.framework, args.model, args.output, args.opset, args.tokenizer, args.use_external_format, args.pipeline, ) if args.quantize: # Ensure requirements for quantization on onnxruntime is met check_onnxruntime_requirements(ORT_QUANTIZE_MINIMUM_VERSION) # onnxruntime optimizations doesn't provide the same level of performances on TensorFlow than PyTorch if args.framework == "tf": print( "\t Using TensorFlow might not provide the same optimization level compared to PyTorch.\n" "\t For TensorFlow users you can try optimizing the model directly through onnxruntime_tools.\n" "\t For more information, please refer to the onnxruntime documentation:\n" "\t\thttps://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers\n" ) print("\n====== Optimizing ONNX model ======") # Quantization works best when using the optimized version of the model args.optimized_output = optimize(args.output) # Do the quantization on the right graph args.quantized_output = quantize(args.optimized_output) # And verify if args.check_loading: print("\n====== Check exported ONNX model(s) ======") verify(args.output) if hasattr(args, "optimized_output"): verify(args.optimized_output) if hasattr(args, "quantized_output"): verify(args.quantized_output) except Exception as e: print(f"Error while converting the model: {e}") exit(1) ================================================ FILE: patches/transformers/4.5.1/convert_graph_to_onnx.py ================================================ # Copyright 2020 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
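# Relative to upstream transformers 4.5.1 (see convert_graph_to_onnx.diff
# above), this patched copy adds a --save-config flag that writes the model
# config and tokenizer init kwargs to a sidecar .json beside the ONNX graph,
# prints the inferred export metadata, enables verbose torch.onnx.export
# output, and no longer aborts when the output folder is non-empty. A hedged
# usage sketch of the patched convert() entry point; the model name and output
# path are illustrative, and importing this file as a module is an assumption:
#
#   from pathlib import Path
#   from convert_graph_to_onnx import convert
#
#   convert("pt", "distilbert-base-cased-distilled-squad",
#           Path("models/distilbert-qa.onnx"), opset=11,
#           pipeline_name="question-answering", save_config=True)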
import os import json from argparse import ArgumentParser from os import listdir, makedirs from pathlib import Path from typing import Dict, List, Optional, Tuple from packaging.version import Version, parse from transformers.file_utils import ModelOutput, is_tf_available, is_torch_available from transformers.pipelines import Pipeline, pipeline from transformers.tokenization_utils import BatchEncoding # This is the minimal required version to # support some ONNX Runtime features ORT_QUANTIZE_MINIMUM_VERSION = parse("1.4.0") SUPPORTED_PIPELINES = [ "feature-extraction", "ner", "sentiment-analysis", "fill-mask", "question-answering", "text-generation", "translation_en_to_fr", "translation_en_to_de", "translation_en_to_ro", ] class OnnxConverterArgumentParser(ArgumentParser): """ Wraps all the script arguments supported to export transformers models to ONNX IR """ def __init__(self): super().__init__("ONNX Converter") self.add_argument( "--pipeline", type=str, choices=SUPPORTED_PIPELINES, default="feature-extraction", ) self.add_argument( "--model", type=str, required=True, help="Model's id or path (ex: bert-base-cased)", ) self.add_argument("--tokenizer", type=str, help="Tokenizer's id or path (ex: bert-base-cased)") self.add_argument( "--framework", type=str, choices=["pt", "tf"], help="Framework for loading the model", ) self.add_argument("--opset", type=int, default=11, help="ONNX opset to use") self.add_argument( "--check-loading", action="store_true", help="Check ONNX is able to load the model", ) self.add_argument( "--use-external-format", action="store_true", help="Allow exporting model >= than 2Gb", ) self.add_argument( "--save-config", action="store_true", help="Save the model configuration along with the ONNX", ) self.add_argument( "--quantize", action="store_true", help="Quantize the neural network to be run with int8", ) self.add_argument("output") def generate_identified_filename(filename: Path, identifier: str) -> Path: """ Append a string-identifier at the end (before the extension, if any) to the provided filepath Args: filename: pathlib.Path The actual path object we would like to add an identifier suffix identifier: The suffix to add Returns: String with concatenated identifier at the end of the filename """ return filename.parent.joinpath(filename.stem + identifier).with_suffix(filename.suffix) def check_onnxruntime_requirements(minimum_version: Version): """ Check onnxruntime is installed and if the installed version match is recent enough Raises: ImportError: If onnxruntime is not installed or too old version is found """ try: import onnxruntime # Parse the version of the installed onnxruntime ort_version = parse(onnxruntime.__version__) # We require 1.4.0 minimum if ort_version < ORT_QUANTIZE_MINIMUM_VERSION: raise ImportError( f"We found an older version of onnxruntime ({onnxruntime.__version__}) " f"but we require onnxruntime to be >= {minimum_version} to enable all the conversions options.\n" f"Please update onnxruntime by running `pip install --upgrade onnxruntime`" ) except ImportError: raise ImportError( "onnxruntime doesn't seem to be currently installed. " "Please install the onnxruntime by running `pip install onnxruntime`" " and relaunch the conversion." 
) def ensure_valid_input(model, tokens, input_names): """ Ensure inputs are presented in the correct order, without any None values Args: model: The model used to forward the input data tokens: BatchEncoding holding the input data input_names: The name of the inputs Returns: Tuple of the ordered input names and the corresponding model arguments """ print("Ensuring inputs are in correct order") model_args_name = model.forward.__code__.co_varnames model_args, ordered_input_names = [], [] for arg_name in model_args_name[1:]: # start at index 1 to skip "self" argument if arg_name in input_names: ordered_input_names.append(arg_name) model_args.append(tokens[arg_name]) else: print(f"{arg_name} is not present in the generated input list.") break print(f"Generated inputs order: {ordered_input_names}") return ordered_input_names, tuple(model_args) def infer_shapes(nlp: Pipeline, framework: str) -> Tuple[List[str], List[str], Dict, BatchEncoding]: """ Attempt to infer the static vs dynamic axes for each input and output tensor of a specific model Args: nlp: The pipeline object holding the model to be exported framework: The framework identifier to dispatch to the correct inference scheme (pt/tf) Returns: - List of the inferred input variable names - List of the inferred output variable names - Dictionary with input/output variable names as keys and dynamic axes as values - a BatchEncoding reference which was used to infer all the above information """ def build_shape_dict(name: str, tensor, is_input: bool, seq_len: int): if isinstance(tensor, (tuple, list)): return [build_shape_dict(name, t, is_input, seq_len) for t in tensor] else: # Let's assume batch is the first axis with only 1 element (might not always be true ...) axes = {[axis for axis, numel in enumerate(tensor.shape) if numel == 1][0]: "batch"} if is_input: if len(tensor.shape) == 2: axes[1] = "sequence" else: raise ValueError(f"Unable to infer tensor axes ({len(tensor.shape)})") else: seq_axes = [dim for dim, shape in enumerate(tensor.shape) if shape == seq_len] axes.update({dim: "sequence" for dim in seq_axes}) print(f"Found {'input' if is_input else 'output'} {name} with shape: {axes}") return axes tokens = nlp.tokenizer("This is a sample output", return_tensors=framework) seq_len = tokens.input_ids.shape[-1] outputs = nlp.model(**tokens) if framework == "pt" else nlp.model(tokens) if isinstance(outputs, ModelOutput): outputs = outputs.to_tuple() if not isinstance(outputs, (list, tuple)): outputs = (outputs,) # Generate input names & axes input_vars = list(tokens.keys()) input_dynamic_axes = {k: build_shape_dict(k, v, True, seq_len) for k, v in tokens.items()} # flatten potentially grouped outputs (past for gpt2, attentions) outputs_flat = [] for output in outputs: if isinstance(output, (tuple, list)): outputs_flat.extend(output) else: outputs_flat.append(output) # Generate output names & axes output_names = [f"output_{i}" for i in range(len(outputs_flat))] output_dynamic_axes = {k: build_shape_dict(k, v, False, seq_len) for k, v in zip(output_names, outputs_flat)} # Create the aggregated axes representation dynamic_axes = dict(input_dynamic_axes, **output_dynamic_axes) return input_vars, output_names, dynamic_axes, tokens def load_graph_from_args( pipeline_name: str, framework: str, model: str, tokenizer: Optional[str] = None, **models_kwargs ) -> Pipeline: """ Convert the set of arguments provided through the CLI to an actual pipeline reference (tokenizer + model) Args: pipeline_name: The kind of pipeline to use (ner, question-answering, etc.) framework: The framework the pipeline is backed by ("pt" or "tf") model: The model name which will be loaded by the pipeline tokenizer: The tokenizer name which will be loaded by the pipeline, defaults to the model's value Returns: Pipeline object """ # If no tokenizer provided if tokenizer is None: tokenizer = model # Check the wanted framework is available if framework == "pt" and not is_torch_available(): raise Exception("Cannot convert because PyTorch is not installed. Please install torch first.") if framework == "tf" and not is_tf_available(): raise Exception("Cannot convert because TF is not installed. Please install tensorflow first.") print(f"Loading pipeline (model: {model}, tokenizer: {tokenizer})") # Allocate tokenizer and model return pipeline(pipeline_name, model=model, tokenizer=tokenizer, framework=framework, model_kwargs=models_kwargs) def convert_pytorch(nlp: Pipeline, opset: int, output: Path, use_external_format: bool): """ Export a PyTorch backed pipeline to ONNX Intermediate Representation (IR) Args: nlp: The pipeline to be exported opset: The actual version of the ONNX operator set to use output: Path where the generated ONNX model will be stored use_external_format: Split the model definition from its parameters to allow models bigger than 2GB Returns: """ if not is_torch_available(): raise Exception("Cannot convert because PyTorch is not installed. Please install torch first.") import torch from torch.onnx import export print(f"Using framework PyTorch: {torch.__version__}") with torch.no_grad(): input_names, output_names, dynamic_axes, tokens = infer_shapes(nlp, "pt") ordered_input_names, model_args = ensure_valid_input(nlp.model, tokens, input_names) print('Exporting from PyTorch to ONNX...') print('input_names', input_names) print('output_names', output_names) print('dynamic_axes', dynamic_axes) print('tokens', tokens) print('model_args', model_args) export( nlp.model, model_args, f=output.as_posix(), input_names=ordered_input_names, output_names=output_names, dynamic_axes=dynamic_axes, do_constant_folding=True, use_external_data_format=use_external_format, enable_onnx_checker=True, opset_version=opset, verbose=True ) def convert_tensorflow(nlp: Pipeline, opset: int, output: Path): """ Export a TensorFlow backed pipeline to ONNX Intermediate Representation (IR) Args: nlp: The pipeline to be exported opset: The actual version of the ONNX operator set to use output: Path where the generated ONNX model will be stored Notes: TensorFlow cannot export models bigger than 2GB due to an internal constraint in TensorFlow """ if not is_tf_available(): raise Exception("Cannot convert because TF is not installed. Please install tensorflow first.") print("/!\\ Please note TensorFlow doesn't support exporting models > 2GB /!\\") try: import tensorflow as tf from keras2onnx import __version__ as k2ov from keras2onnx import convert_keras, save_model print(f"Using framework TensorFlow: {tf.version.VERSION}, keras2onnx: {k2ov}") # Build input_names, output_names, dynamic_axes, tokens = infer_shapes(nlp, "tf") # Forward nlp.model.predict(tokens.data) onnx_model = convert_keras(nlp.model, nlp.model.name, target_opset=opset) save_model(onnx_model, output.as_posix()) except ImportError as e: raise Exception(f"Cannot import {e.name} required to convert TF model to ONNX. Please install {e.name} first.") def convert( framework: str, model: str, output: Path, opset: int, tokenizer: Optional[str] = None, use_external_format: bool = False, pipeline_name: str = "feature-extraction", save_config: bool = False, **model_kwargs ): """ Convert the pipeline object to the ONNX Intermediate Representation (IR) format Args: framework: The framework the pipeline is backed by ("pt" or "tf") model: The name of the model to load for the pipeline output: The path where the ONNX graph will be stored opset: The actual version of the ONNX operator set to use tokenizer: The name of the tokenizer to load for the pipeline, defaults to the model's name if not provided use_external_format: Split the model definition from its parameters to allow models bigger than 2GB (PyTorch only) pipeline_name: The kind of pipeline to instantiate (ner, question-answering, etc.) model_kwargs: Keyword arguments to be forwarded to the model constructor Returns: """ print(f"ONNX opset version set to: {opset}") # Load the pipeline nlp = load_graph_from_args(pipeline_name, framework, model, tokenizer, **model_kwargs) if not output.parent.exists(): print(f"Creating folder {output.parent}") makedirs(output.parent.as_posix()) #elif len(listdir(output.parent.as_posix())) > 0: # raise Exception(f"Folder {output.parent.as_posix()} is not empty, aborting conversion") # Export the graph if framework == "pt": convert_pytorch(nlp, opset, output, use_external_format) else: convert_tensorflow(nlp, opset, output) # Save the configuration if save_config: config_path = os.path.splitext(output)[0] + '.json' config = dict( model = nlp.model.config.to_dict(), tokenizer = nlp.tokenizer.init_kwargs ) #nlp.model.config.to_json_file(config_path) with open(config_path, 'w') as config_file: json.dump(config, config_file, indent=2) print(f"Saved config to {config_path}") def optimize(onnx_model_path: Path) -> Path: """ Load the model at the specified path and let onnxruntime apply transformations on the graph to enable all the optimizations possible Args: onnx_model_path: filepath where the model binary description is stored Returns: Path where the optimized model binary description has been saved """ from onnxruntime import InferenceSession, SessionOptions # Generate model name with suffix "optimized" opt_model_path = generate_identified_filename(onnx_model_path, "-optimized") sess_option = SessionOptions() sess_option.optimized_model_filepath = opt_model_path.as_posix() _ = InferenceSession(onnx_model_path.as_posix(), sess_option) print(f"Optimized model has been written at {opt_model_path}: \N{heavy check mark}") print("/!\\ Optimized model contains hardware specific operators which might not be portable. /!\\") return opt_model_path def quantize(onnx_model_path: Path) -> Path: """ Quantize the weights of the model from float32 to int8 to allow very efficient inference on modern CPUs Args: onnx_model_path: Path to the location where the exported ONNX model is stored Returns: The Path generated for the quantized model """ import onnx from onnxruntime.quantization import QuantizationMode, quantize onnx_model = onnx.load(onnx_model_path.as_posix()) # Discussed with @yufenglee from ONNX runtime, this will be addressed in the next release of onnxruntime print( "As of onnxruntime 1.4.0, models larger than 2GB will fail to quantize due to protobuf constraint.\n" "This limitation will be removed in the next release of onnxruntime."
) quantized_model = quantize( model=onnx_model, quantization_mode=QuantizationMode.IntegerOps, force_fusions=True, symmetric_weight=True, ) # Append "-quantized" at the end of the model's name quantized_model_path = generate_identified_filename(onnx_model_path, "-quantized") # Save model print(f"Quantized model has been written at {quantized_model_path}: \N{heavy check mark}") onnx.save_model(quantized_model, quantized_model_path.as_posix()) return quantized_model_path def verify(path: Path): from onnxruntime import InferenceSession, SessionOptions from onnxruntime.capi.onnxruntime_pybind11_state import RuntimeException print(f"Checking ONNX model loading from: {path} ...") try: onnx_options = SessionOptions() _ = InferenceSession(path.as_posix(), onnx_options, providers=["CPUExecutionProvider"]) print(f"Model {path} correctly loaded: \N{heavy check mark}") except RuntimeException as re: print(f"Error while loading the model {re}: \N{heavy ballot x}") if __name__ == "__main__": parser = OnnxConverterArgumentParser() args = parser.parse_args() # Make sure output is absolute path args.output = Path(args.output).absolute() try: print("\n====== Converting model to ONNX ======") # Convert convert( args.framework, args.model, args.output, args.opset, args.tokenizer, args.use_external_format, args.pipeline, args.save_config ) if args.quantize: # Ensure requirements for quantization on onnxruntime is met check_onnxruntime_requirements(ORT_QUANTIZE_MINIMUM_VERSION) # onnxruntime optimizations doesn't provide the same level of performances on TensorFlow than PyTorch if args.framework == "tf": print( "\t Using TensorFlow might not provide the same optimization level compared to PyTorch.\n" "\t For TensorFlow users you can try optimizing the model directly through onnxruntime_tools.\n" "\t For more information, please refer to the onnxruntime documentation:\n" "\t\thttps://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers\n" ) print("\n====== Optimizing ONNX model ======") # Quantization works best when using the optimized version of the model args.optimized_output = optimize(args.output) # Do the quantization on the right graph args.quantized_output = quantize(args.optimized_output) # And verify if args.check_loading: print("\n====== Check exported ONNX model(s) ======") verify(args.output) if hasattr(args, "optimized_output"): verify(args.optimized_output) if hasattr(args, "quantized_output"): verify(args.quantized_output) except Exception as e: print(f"Error while converting the model: {e}") exit(1) ================================================ FILE: patches/transformers/4.5.1/modeling_distilbert.diff ================================================ 183,184c183,184 < mask = (mask == 0).view(mask_reshp).expand_as(scores) # (bs, n_heads, q_length, k_length) < scores.masked_fill_(mask, -float("inf")) # (bs, n_heads, q_length, k_length) --- > mask = mask.view(mask_reshp).expand_as(scores) # (bs, n_heads, q_length, k_length) > scores.masked_fill_((mask == 0), -float("inf")) # (bs, n_heads, q_length, k_length) ================================================ FILE: patches/transformers/4.5.1/modeling_distilbert.original.py ================================================ # coding=utf-8 # Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ PyTorch DistilBERT model adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM) and in part from HuggingFace PyTorch version of Google AI Bert model (https://github.com/google-research/bert) """ import copy import math import numpy as np import torch import torch.nn as nn from torch.nn import CrossEntropyLoss from ...activations import gelu from ...file_utils import ( add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings, ) from ...modeling_outputs import ( BaseModelOutput, MaskedLMOutput, MultipleChoiceModelOutput, QuestionAnsweringModelOutput, SequenceClassifierOutput, TokenClassifierOutput, ) from ...modeling_utils import ( PreTrainedModel, apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer, ) from ...utils import logging from .configuration_distilbert import DistilBertConfig logger = logging.get_logger(__name__) _CHECKPOINT_FOR_DOC = "distilbert-base-uncased" _CONFIG_FOR_DOC = "DistilBertConfig" _TOKENIZER_FOR_DOC = "DistilBertTokenizer" DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "distilbert-base-uncased", "distilbert-base-uncased-distilled-squad", "distilbert-base-cased", "distilbert-base-cased-distilled-squad", "distilbert-base-german-cased", "distilbert-base-multilingual-cased", "distilbert-base-uncased-finetuned-sst-2-english", # See all DistilBERT models at https://huggingface.co/models?filter=distilbert ] # UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE # def create_sinusoidal_embeddings(n_pos, dim, out): position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]) out.requires_grad = False out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2])) out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2])) out.detach_() class Embeddings(nn.Module): def __init__(self, config): super().__init__() self.word_embeddings = nn.Embedding(config.vocab_size, config.dim, padding_idx=config.pad_token_id) self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.dim) if config.sinusoidal_pos_embds: create_sinusoidal_embeddings( n_pos=config.max_position_embeddings, dim=config.dim, out=self.position_embeddings.weight ) self.LayerNorm = nn.LayerNorm(config.dim, eps=1e-12) self.dropout = nn.Dropout(config.dropout) def forward(self, input_ids): """ Parameters: input_ids: torch.tensor(bs, max_seq_length) The token ids to embed. 
Returns: torch.tensor(bs, max_seq_length, dim) The embedded tokens (plus position embeddings, no token_type embeddings) """ seq_length = input_ids.size(1) position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) # (max_seq_length) position_ids = position_ids.unsqueeze(0).expand_as(input_ids) # (bs, max_seq_length) word_embeddings = self.word_embeddings(input_ids) # (bs, max_seq_length, dim) position_embeddings = self.position_embeddings(position_ids) # (bs, max_seq_length, dim) embeddings = word_embeddings + position_embeddings # (bs, max_seq_length, dim) embeddings = self.LayerNorm(embeddings) # (bs, max_seq_length, dim) embeddings = self.dropout(embeddings) # (bs, max_seq_length, dim) return embeddings class MultiHeadSelfAttention(nn.Module): def __init__(self, config): super().__init__() self.n_heads = config.n_heads self.dim = config.dim self.dropout = nn.Dropout(p=config.attention_dropout) assert self.dim % self.n_heads == 0 self.q_lin = nn.Linear(in_features=config.dim, out_features=config.dim) self.k_lin = nn.Linear(in_features=config.dim, out_features=config.dim) self.v_lin = nn.Linear(in_features=config.dim, out_features=config.dim) self.out_lin = nn.Linear(in_features=config.dim, out_features=config.dim) self.pruned_heads = set() def prune_heads(self, heads): attention_head_size = self.dim // self.n_heads if len(heads) == 0: return heads, index = find_pruneable_heads_and_indices(heads, self.n_heads, attention_head_size, self.pruned_heads) # Prune linear layers self.q_lin = prune_linear_layer(self.q_lin, index) self.k_lin = prune_linear_layer(self.k_lin, index) self.v_lin = prune_linear_layer(self.v_lin, index) self.out_lin = prune_linear_layer(self.out_lin, index, dim=1) # Update hyper params self.n_heads = self.n_heads - len(heads) self.dim = attention_head_size * self.n_heads self.pruned_heads = self.pruned_heads.union(heads) def forward(self, query, key, value, mask, head_mask=None, output_attentions=False): """ Parameters: query: torch.tensor(bs, seq_length, dim) key: torch.tensor(bs, seq_length, dim) value: torch.tensor(bs, seq_length, dim) mask: torch.tensor(bs, seq_length) Returns: weights: torch.tensor(bs, n_heads, seq_length, seq_length) Attention weights context: torch.tensor(bs, seq_length, dim) Contextualized layer. 
Optional: only if `output_attentions=True` """ bs, q_length, dim = query.size() k_length = key.size(1) # assert dim == self.dim, f'Dimensions do not match: {dim} input vs {self.dim} configured' # assert key.size() == value.size() dim_per_head = self.dim // self.n_heads mask_reshp = (bs, 1, 1, k_length) def shape(x): """ separate heads """ return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2) def unshape(x): """ group heads """ return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head) q = shape(self.q_lin(query)) # (bs, n_heads, q_length, dim_per_head) k = shape(self.k_lin(key)) # (bs, n_heads, k_length, dim_per_head) v = shape(self.v_lin(value)) # (bs, n_heads, k_length, dim_per_head) q = q / math.sqrt(dim_per_head) # (bs, n_heads, q_length, dim_per_head) scores = torch.matmul(q, k.transpose(2, 3)) # (bs, n_heads, q_length, k_length) mask = (mask == 0).view(mask_reshp).expand_as(scores) # (bs, n_heads, q_length, k_length) scores.masked_fill_(mask, -float("inf")) # (bs, n_heads, q_length, k_length) weights = nn.Softmax(dim=-1)(scores) # (bs, n_heads, q_length, k_length) weights = self.dropout(weights) # (bs, n_heads, q_length, k_length) # Mask heads if we want to if head_mask is not None: weights = weights * head_mask context = torch.matmul(weights, v) # (bs, n_heads, q_length, dim_per_head) context = unshape(context) # (bs, q_length, dim) context = self.out_lin(context) # (bs, q_length, dim) if output_attentions: return (context, weights) else: return (context,) class FFN(nn.Module): def __init__(self, config): super().__init__() self.dropout = nn.Dropout(p=config.dropout) self.chunk_size_feed_forward = config.chunk_size_feed_forward self.seq_len_dim = 1 self.lin1 = nn.Linear(in_features=config.dim, out_features=config.hidden_dim) self.lin2 = nn.Linear(in_features=config.hidden_dim, out_features=config.dim) assert config.activation in ["relu", "gelu"], f"activation ({config.activation}) must be in ['relu', 'gelu']" self.activation = gelu if config.activation == "gelu" else nn.ReLU() def forward(self, input): return apply_chunking_to_forward(self.ff_chunk, self.chunk_size_feed_forward, self.seq_len_dim, input) def ff_chunk(self, input): x = self.lin1(input) x = self.activation(x) x = self.lin2(x) x = self.dropout(x) return x class TransformerBlock(nn.Module): def __init__(self, config): super().__init__() assert config.dim % config.n_heads == 0 self.attention = MultiHeadSelfAttention(config) self.sa_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12) self.ffn = FFN(config) self.output_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12) def forward(self, x, attn_mask=None, head_mask=None, output_attentions=False): """ Parameters: x: torch.tensor(bs, seq_length, dim) attn_mask: torch.tensor(bs, seq_length) Returns: sa_weights: torch.tensor(bs, n_heads, seq_length, seq_length) The attention weights ffn_output: torch.tensor(bs, seq_length, dim) The output of the transformer block contextualization. 
""" # Self-Attention sa_output = self.attention( query=x, key=x, value=x, mask=attn_mask, head_mask=head_mask, output_attentions=output_attentions, ) if output_attentions: sa_output, sa_weights = sa_output # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length) else: # To handle these `output_attentions` or `output_hidden_states` cases returning tuples assert type(sa_output) == tuple sa_output = sa_output[0] sa_output = self.sa_layer_norm(sa_output + x) # (bs, seq_length, dim) # Feed Forward Network ffn_output = self.ffn(sa_output) # (bs, seq_length, dim) ffn_output = self.output_layer_norm(ffn_output + sa_output) # (bs, seq_length, dim) output = (ffn_output,) if output_attentions: output = (sa_weights,) + output return output class Transformer(nn.Module): def __init__(self, config): super().__init__() self.n_layers = config.n_layers layer = TransformerBlock(config) self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.n_layers)]) def forward( self, x, attn_mask=None, head_mask=None, output_attentions=False, output_hidden_states=False, return_dict=None ): # docstyle-ignore """ Parameters: x: torch.tensor(bs, seq_length, dim) Input sequence embedded. attn_mask: torch.tensor(bs, seq_length) Attention mask on the sequence. Returns: hidden_state: torch.tensor(bs, seq_length, dim) Sequence of hidden states in the last (top) layer all_hidden_states: Tuple[torch.tensor(bs, seq_length, dim)] Tuple of length n_layers with the hidden states from each layer. Optional: only if output_hidden_states=True all_attentions: Tuple[torch.tensor(bs, n_heads, seq_length, seq_length)] Tuple of length n_layers with the attention weights from each layer Optional: only if output_attentions=True """ all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None hidden_state = x for i, layer_module in enumerate(self.layer): if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_state,) layer_outputs = layer_module( x=hidden_state, attn_mask=attn_mask, head_mask=head_mask[i], output_attentions=output_attentions ) hidden_state = layer_outputs[-1] if output_attentions: assert len(layer_outputs) == 2 attentions = layer_outputs[0] all_attentions = all_attentions + (attentions,) else: assert len(layer_outputs) == 1 # Add last layer if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_state,) if not return_dict: return tuple(v for v in [hidden_state, all_hidden_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_state, hidden_states=all_hidden_states, attentions=all_attentions ) # INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL # class DistilBertPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. 
""" config_class = DistilBertConfig load_tf_weights = None base_model_prefix = "distilbert" def _init_weights(self, module): """Initialize the weights.""" if isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if module.bias is not None: module.bias.data.zero_() elif isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) DISTILBERT_START_DOCSTRING = r""" This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) This model is also a PyTorch `torch.nn.Module `__ subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers.DistilBertConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ DISTILBERT_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`~transformers.DistilBertTokenizer`. See :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. `What are attention masks? <../glossary.html#attention-mask>`__ head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: - 1 indicates the head is **not masked**, - 0 indicates the head is **masked**. inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert :obj:`input_ids` indices into associated vectors than the model's internal embedding lookup matrix. output_attentions (:obj:`bool`, `optional`): Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned tensors for more detail. output_hidden_states (:obj:`bool`, `optional`): Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for more detail. return_dict (:obj:`bool`, `optional`): Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. 
""" @add_start_docstrings( "The bare DistilBERT encoder/transformer outputting raw hidden-states without any specific head on top.", DISTILBERT_START_DOCSTRING, ) class DistilBertModel(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.embeddings = Embeddings(config) # Embeddings self.transformer = Transformer(config) # Encoder self.init_weights() def get_input_embeddings(self): return self.embeddings.word_embeddings def set_input_embeddings(self, new_embeddings): self.embeddings.word_embeddings = new_embeddings def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ for layer, heads in heads_to_prune.items(): self.transformer.layer[layer].attention.prune_heads(heads) @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC, ) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = input_ids.size() elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") device = input_ids.device if input_ids is not None else inputs_embeds.device if attention_mask is None: attention_mask = torch.ones(input_shape, device=device) # (bs, seq_length) # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) if inputs_embeds is None: inputs_embeds = self.embeddings(input_ids) # (bs, seq_length, dim) return self.transformer( x=inputs_embeds, attn_mask=attention_mask, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) @add_start_docstrings( """DistilBert Model with a `masked language modeling` head on top. 
""", DISTILBERT_START_DOCSTRING, ) class DistilBertForMaskedLM(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.distilbert = DistilBertModel(config) self.vocab_transform = nn.Linear(config.dim, config.dim) self.vocab_layer_norm = nn.LayerNorm(config.dim, eps=1e-12) self.vocab_projector = nn.Linear(config.dim, config.vocab_size) self.init_weights() self.mlm_loss_fct = nn.CrossEntropyLoss() def get_output_embeddings(self): return self.vocab_projector def set_output_embeddings(self, new_embeddings): self.vocab_projector = new_embeddings @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC, ) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``. """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict dlbrt_output = self.distilbert( input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_states = dlbrt_output[0] # (bs, seq_length, dim) prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim) prediction_logits = gelu(prediction_logits) # (bs, seq_length, dim) prediction_logits = self.vocab_layer_norm(prediction_logits) # (bs, seq_length, dim) prediction_logits = self.vocab_projector(prediction_logits) # (bs, seq_length, vocab_size) mlm_loss = None if labels is not None: mlm_loss = self.mlm_loss_fct(prediction_logits.view(-1, prediction_logits.size(-1)), labels.view(-1)) if not return_dict: output = (prediction_logits,) + dlbrt_output[1:] return ((mlm_loss,) + output) if mlm_loss is not None else output return MaskedLMOutput( loss=mlm_loss, logits=prediction_logits, hidden_states=dlbrt_output.hidden_states, attentions=dlbrt_output.attentions, ) @add_start_docstrings( """ DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. 
""", DISTILBERT_START_DOCSTRING, ) class DistilBertForSequenceClassification(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.distilbert = DistilBertModel(config) self.pre_classifier = nn.Linear(config.dim, config.dim) self.classifier = nn.Linear(config.dim, config.num_labels) self.dropout = nn.Dropout(config.seq_classif_dropout) self.init_weights() @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, ) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict distilbert_output = self.distilbert( input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_state = distilbert_output[0] # (bs, seq_len, dim) pooled_output = hidden_state[:, 0] # (bs, dim) pooled_output = self.pre_classifier(pooled_output) # (bs, dim) pooled_output = nn.ReLU()(pooled_output) # (bs, dim) pooled_output = self.dropout(pooled_output) # (bs, dim) logits = self.classifier(pooled_output) # (bs, num_labels) loss = None if labels is not None: if self.num_labels == 1: loss_fct = nn.MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = nn.CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) if not return_dict: output = (logits,) + distilbert_output[1:] return ((loss,) + output) if loss is not None else output return SequenceClassifierOutput( loss=loss, logits=logits, hidden_states=distilbert_output.hidden_states, attentions=distilbert_output.attentions, ) @add_start_docstrings( """ DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
""", DISTILBERT_START_DOCSTRING, ) class DistilBertForQuestionAnswering(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.distilbert = DistilBertModel(config) self.qa_outputs = nn.Linear(config.dim, config.num_labels) assert config.num_labels == 2 self.dropout = nn.Dropout(config.qa_dropout) self.init_weights() @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=QuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC, ) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict distilbert_output = self.distilbert( input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_states = distilbert_output[0] # (bs, max_query_len, dim) hidden_states = self.dropout(hidden_states) # (bs, max_query_len, dim) logits = self.qa_outputs(hidden_states) # (bs, max_query_len, 2) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1) # (bs, max_query_len) end_logits = end_logits.squeeze(-1) # (bs, max_query_len) total_loss = None if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: start_positions = start_positions.squeeze(-1) if len(end_positions.size()) > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms ignored_index = start_logits.size(1) start_positions.clamp_(0, ignored_index) end_positions.clamp_(0, ignored_index) loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index) start_loss = loss_fct(start_logits, start_positions) end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 if not return_dict: output = (start_logits, end_logits) + distilbert_output[1:] return ((total_loss,) + output) if total_loss is not None else output return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, end_logits=end_logits, hidden_states=distilbert_output.hidden_states, attentions=distilbert_output.attentions, ) @add_start_docstrings( """ DistilBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 
""", DISTILBERT_START_DOCSTRING, ) class DistilBertForTokenClassification(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.distilbert = DistilBertModel(config) self.dropout = nn.Dropout(config.dropout) self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC, ) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.distilbert( input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) loss = None if labels is not None: loss_fct = CrossEntropyLoss() # Only keep active parts of the loss if attention_mask is not None: active_loss = attention_mask.view(-1) == 1 active_logits = logits.view(-1, self.num_labels) active_labels = torch.where( active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) ) loss = loss_fct(active_logits, active_labels) else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) if not return_dict: output = (logits,) + outputs[1:] return ((loss,) + output) if loss is not None else output return TokenClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) @add_start_docstrings( """ DistilBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, DISTILBERT_START_DOCSTRING, ) class DistilBertForMultipleChoice(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.distilbert = DistilBertModel(config) self.pre_classifier = nn.Linear(config.dim, config.dim) self.classifier = nn.Linear(config.dim, 1) self.dropout = nn.Dropout(config.seq_classif_dropout) self.init_weights() @add_start_docstrings_to_model_forward( DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") ) @replace_return_docstrings(output_type=MultipleChoiceModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. 
(See :obj:`input_ids` above) Returns: Examples:: >>> from transformers import DistilBertTokenizer, DistilBertForMultipleChoice >>> import torch >>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') >>> model = DistilBertForMultipleChoice.from_pretrained('distilbert-base-cased') >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." >>> choice0 = "It is eaten with a fork and a knife." >>> choice1 = "It is eaten while held in the hand." >>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1 >>> encoding = tokenizer([[prompt, choice0], [prompt, choice1]], return_tensors='pt', padding=True) >>> outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1 >>> # the linear classifier still needs to be trained >>> loss = outputs.loss >>> logits = outputs.logits """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None inputs_embeds = ( inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) if inputs_embeds is not None else None ) outputs = self.distilbert( input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_state = outputs[0] # (bs * num_choices, seq_len, dim) pooled_output = hidden_state[:, 0] # (bs * num_choices, dim) pooled_output = self.pre_classifier(pooled_output) # (bs * num_choices, dim) pooled_output = nn.ReLU()(pooled_output) # (bs * num_choices, dim) pooled_output = self.dropout(pooled_output) # (bs * num_choices, dim) logits = self.classifier(pooled_output) # (bs * num_choices, 1) reshaped_logits = logits.view(-1, num_choices) # (bs, num_choices) loss = None if labels is not None: loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) if not return_dict: output = (reshaped_logits,) + outputs[1:] return ((loss,) + output) if loss is not None else output return MultipleChoiceModelOutput( loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) ================================================ FILE: patches/transformers/4.5.1/modeling_distilbert.py ================================================ # coding=utf-8 # Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
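# NOTE: relative to modeling_distilbert.original.py, the only change in this
# patched copy is the ordering of the attention-mask computation in
# MultiHeadSelfAttention.forward (see modeling_distilbert.diff): the boolean
# comparison (mask == 0) is moved from before the view()/expand_as() to the
# masked_fill_() call site, presumably so the exported ONNX graph expands the
# numeric mask rather than a bool tensor. Both orderings are numerically
# identical; the sketch below (illustrative only, with made-up shapes, never
# called by the model) demonstrates the equivalence:
def _mask_order_equivalence_sketch():
    import torch
    bs, n_heads, q_length, k_length = 2, 4, 8, 8
    scores = torch.randn(bs, n_heads, q_length, k_length)
    mask = torch.randint(0, 2, (bs, k_length))  # (bs, k_length), 1 = attend, 0 = padding
    mask_reshp = (bs, 1, 1, k_length)
    # original ordering: boolean-ize first, then reshape/expand
    m1 = (mask == 0).view(mask_reshp).expand_as(scores)
    s1 = scores.clone().masked_fill_(m1, -float("inf"))
    # patched ordering: reshape/expand the raw mask, compare at the fill site
    m2 = mask.view(mask_reshp).expand_as(scores)
    s2 = scores.clone().masked_fill_((m2 == 0), -float("inf"))
    assert torch.equal(s1, s2)  # same scores, different traced op order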
""" PyTorch DistilBERT model adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM) and in part from HuggingFace PyTorch version of Google AI Bert model (https://github.com/google-research/bert) """ import copy import math import numpy as np import torch import torch.nn as nn from torch.nn import CrossEntropyLoss from ...activations import gelu from ...file_utils import ( add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings, ) from ...modeling_outputs import ( BaseModelOutput, MaskedLMOutput, MultipleChoiceModelOutput, QuestionAnsweringModelOutput, SequenceClassifierOutput, TokenClassifierOutput, ) from ...modeling_utils import ( PreTrainedModel, apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer, ) from ...utils import logging from .configuration_distilbert import DistilBertConfig logger = logging.get_logger(__name__) _CHECKPOINT_FOR_DOC = "distilbert-base-uncased" _CONFIG_FOR_DOC = "DistilBertConfig" _TOKENIZER_FOR_DOC = "DistilBertTokenizer" DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "distilbert-base-uncased", "distilbert-base-uncased-distilled-squad", "distilbert-base-cased", "distilbert-base-cased-distilled-squad", "distilbert-base-german-cased", "distilbert-base-multilingual-cased", "distilbert-base-uncased-finetuned-sst-2-english", # See all DistilBERT models at https://huggingface.co/models?filter=distilbert ] # UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE # def create_sinusoidal_embeddings(n_pos, dim, out): position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]) out.requires_grad = False out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2])) out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2])) out.detach_() class Embeddings(nn.Module): def __init__(self, config): super().__init__() self.word_embeddings = nn.Embedding(config.vocab_size, config.dim, padding_idx=config.pad_token_id) self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.dim) if config.sinusoidal_pos_embds: create_sinusoidal_embeddings( n_pos=config.max_position_embeddings, dim=config.dim, out=self.position_embeddings.weight ) self.LayerNorm = nn.LayerNorm(config.dim, eps=1e-12) self.dropout = nn.Dropout(config.dropout) def forward(self, input_ids): """ Parameters: input_ids: torch.tensor(bs, max_seq_length) The token ids to embed. 
Returns: torch.tensor(bs, max_seq_length, dim) The embedded tokens (plus position embeddings, no token_type embeddings) """ seq_length = input_ids.size(1) position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) # (max_seq_length) position_ids = position_ids.unsqueeze(0).expand_as(input_ids) # (bs, max_seq_length) word_embeddings = self.word_embeddings(input_ids) # (bs, max_seq_length, dim) position_embeddings = self.position_embeddings(position_ids) # (bs, max_seq_length, dim) embeddings = word_embeddings + position_embeddings # (bs, max_seq_length, dim) embeddings = self.LayerNorm(embeddings) # (bs, max_seq_length, dim) embeddings = self.dropout(embeddings) # (bs, max_seq_length, dim) return embeddings class MultiHeadSelfAttention(nn.Module): def __init__(self, config): super().__init__() self.n_heads = config.n_heads self.dim = config.dim self.dropout = nn.Dropout(p=config.attention_dropout) assert self.dim % self.n_heads == 0 self.q_lin = nn.Linear(in_features=config.dim, out_features=config.dim) self.k_lin = nn.Linear(in_features=config.dim, out_features=config.dim) self.v_lin = nn.Linear(in_features=config.dim, out_features=config.dim) self.out_lin = nn.Linear(in_features=config.dim, out_features=config.dim) self.pruned_heads = set() def prune_heads(self, heads): attention_head_size = self.dim // self.n_heads if len(heads) == 0: return heads, index = find_pruneable_heads_and_indices(heads, self.n_heads, attention_head_size, self.pruned_heads) # Prune linear layers self.q_lin = prune_linear_layer(self.q_lin, index) self.k_lin = prune_linear_layer(self.k_lin, index) self.v_lin = prune_linear_layer(self.v_lin, index) self.out_lin = prune_linear_layer(self.out_lin, index, dim=1) # Update hyper params self.n_heads = self.n_heads - len(heads) self.dim = attention_head_size * self.n_heads self.pruned_heads = self.pruned_heads.union(heads) def forward(self, query, key, value, mask, head_mask=None, output_attentions=False): """ Parameters: query: torch.tensor(bs, seq_length, dim) key: torch.tensor(bs, seq_length, dim) value: torch.tensor(bs, seq_length, dim) mask: torch.tensor(bs, seq_length) Returns: weights: torch.tensor(bs, n_heads, seq_length, seq_length) Attention weights context: torch.tensor(bs, seq_length, dim) Contextualized layer. 
Optional: only if `output_attentions=True` """ bs, q_length, dim = query.size() k_length = key.size(1) # assert dim == self.dim, f'Dimensions do not match: {dim} input vs {self.dim} configured' # assert key.size() == value.size() dim_per_head = self.dim // self.n_heads mask_reshp = (bs, 1, 1, k_length) def shape(x): """ separate heads """ return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2) def unshape(x): """ group heads """ return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head) q = shape(self.q_lin(query)) # (bs, n_heads, q_length, dim_per_head) k = shape(self.k_lin(key)) # (bs, n_heads, k_length, dim_per_head) v = shape(self.v_lin(value)) # (bs, n_heads, k_length, dim_per_head) q = q / math.sqrt(dim_per_head) # (bs, n_heads, q_length, dim_per_head) scores = torch.matmul(q, k.transpose(2, 3)) # (bs, n_heads, q_length, k_length) mask = mask.view(mask_reshp).expand_as(scores) # (bs, n_heads, q_length, k_length) scores.masked_fill_((mask == 0), -float("inf")) # (bs, n_heads, q_length, k_length) weights = nn.Softmax(dim=-1)(scores) # (bs, n_heads, q_length, k_length) weights = self.dropout(weights) # (bs, n_heads, q_length, k_length) # Mask heads if we want to if head_mask is not None: weights = weights * head_mask context = torch.matmul(weights, v) # (bs, n_heads, q_length, dim_per_head) context = unshape(context) # (bs, q_length, dim) context = self.out_lin(context) # (bs, q_length, dim) if output_attentions: return (context, weights) else: return (context,) class FFN(nn.Module): def __init__(self, config): super().__init__() self.dropout = nn.Dropout(p=config.dropout) self.chunk_size_feed_forward = config.chunk_size_feed_forward self.seq_len_dim = 1 self.lin1 = nn.Linear(in_features=config.dim, out_features=config.hidden_dim) self.lin2 = nn.Linear(in_features=config.hidden_dim, out_features=config.dim) assert config.activation in ["relu", "gelu"], f"activation ({config.activation}) must be in ['relu', 'gelu']" self.activation = gelu if config.activation == "gelu" else nn.ReLU() def forward(self, input): return apply_chunking_to_forward(self.ff_chunk, self.chunk_size_feed_forward, self.seq_len_dim, input) def ff_chunk(self, input): x = self.lin1(input) x = self.activation(x) x = self.lin2(x) x = self.dropout(x) return x class TransformerBlock(nn.Module): def __init__(self, config): super().__init__() assert config.dim % config.n_heads == 0 self.attention = MultiHeadSelfAttention(config) self.sa_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12) self.ffn = FFN(config) self.output_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12) def forward(self, x, attn_mask=None, head_mask=None, output_attentions=False): """ Parameters: x: torch.tensor(bs, seq_length, dim) attn_mask: torch.tensor(bs, seq_length) Returns: sa_weights: torch.tensor(bs, n_heads, seq_length, seq_length) The attention weights ffn_output: torch.tensor(bs, seq_length, dim) The output of the transformer block contextualization. 
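Example (an illustrative sketch with made-up shapes; in practice the block is driven by Transformer/DistilBertModel rather than called directly)::

    >>> import torch
    >>> from transformers import DistilBertConfig
    >>> config = DistilBertConfig()            # defaults: dim=768, n_heads=12
    >>> block = TransformerBlock(config)
    >>> x = torch.randn(1, 8, config.dim)      # (bs, seq_length, dim)
    >>> attn_mask = torch.ones(1, 8)           # (bs, seq_length), 1 = attend, 0 = padding
    >>> (hidden_state,) = block(x, attn_mask=attn_mask)
    >>> hidden_state.shape
    torch.Size([1, 8, 768])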
""" # Self-Attention sa_output = self.attention( query=x, key=x, value=x, mask=attn_mask, head_mask=head_mask, output_attentions=output_attentions, ) if output_attentions: sa_output, sa_weights = sa_output # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length) else: # To handle these `output_attentions` or `output_hidden_states` cases returning tuples assert type(sa_output) == tuple sa_output = sa_output[0] sa_output = self.sa_layer_norm(sa_output + x) # (bs, seq_length, dim) # Feed Forward Network ffn_output = self.ffn(sa_output) # (bs, seq_length, dim) ffn_output = self.output_layer_norm(ffn_output + sa_output) # (bs, seq_length, dim) output = (ffn_output,) if output_attentions: output = (sa_weights,) + output return output class Transformer(nn.Module): def __init__(self, config): super().__init__() self.n_layers = config.n_layers layer = TransformerBlock(config) self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.n_layers)]) def forward( self, x, attn_mask=None, head_mask=None, output_attentions=False, output_hidden_states=False, return_dict=None ): # docstyle-ignore """ Parameters: x: torch.tensor(bs, seq_length, dim) Input sequence embedded. attn_mask: torch.tensor(bs, seq_length) Attention mask on the sequence. Returns: hidden_state: torch.tensor(bs, seq_length, dim) Sequence of hidden states in the last (top) layer all_hidden_states: Tuple[torch.tensor(bs, seq_length, dim)] Tuple of length n_layers with the hidden states from each layer. Optional: only if output_hidden_states=True all_attentions: Tuple[torch.tensor(bs, n_heads, seq_length, seq_length)] Tuple of length n_layers with the attention weights from each layer Optional: only if output_attentions=True """ all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None hidden_state = x for i, layer_module in enumerate(self.layer): if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_state,) layer_outputs = layer_module( x=hidden_state, attn_mask=attn_mask, head_mask=head_mask[i], output_attentions=output_attentions ) hidden_state = layer_outputs[-1] if output_attentions: assert len(layer_outputs) == 2 attentions = layer_outputs[0] all_attentions = all_attentions + (attentions,) else: assert len(layer_outputs) == 1 # Add last layer if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_state,) if not return_dict: return tuple(v for v in [hidden_state, all_hidden_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_state, hidden_states=all_hidden_states, attentions=all_attentions ) # INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL # class DistilBertPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. 
""" config_class = DistilBertConfig load_tf_weights = None base_model_prefix = "distilbert" def _init_weights(self, module): """Initialize the weights.""" if isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if module.bias is not None: module.bias.data.zero_() elif isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) DISTILBERT_START_DOCSTRING = r""" This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) This model is also a PyTorch `torch.nn.Module `__ subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers.DistilBertConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ DISTILBERT_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`~transformers.DistilBertTokenizer`. See :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. `What are attention masks? <../glossary.html#attention-mask>`__ head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: - 1 indicates the head is **not masked**, - 0 indicates the head is **masked**. inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert :obj:`input_ids` indices into associated vectors than the model's internal embedding lookup matrix. output_attentions (:obj:`bool`, `optional`): Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned tensors for more detail. output_hidden_states (:obj:`bool`, `optional`): Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for more detail. return_dict (:obj:`bool`, `optional`): Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. 
""" @add_start_docstrings( "The bare DistilBERT encoder/transformer outputting raw hidden-states without any specific head on top.", DISTILBERT_START_DOCSTRING, ) class DistilBertModel(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.embeddings = Embeddings(config) # Embeddings self.transformer = Transformer(config) # Encoder self.init_weights() def get_input_embeddings(self): return self.embeddings.word_embeddings def set_input_embeddings(self, new_embeddings): self.embeddings.word_embeddings = new_embeddings def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ for layer, heads in heads_to_prune.items(): self.transformer.layer[layer].attention.prune_heads(heads) @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC, ) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = input_ids.size() elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") device = input_ids.device if input_ids is not None else inputs_embeds.device if attention_mask is None: attention_mask = torch.ones(input_shape, device=device) # (bs, seq_length) # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) if inputs_embeds is None: inputs_embeds = self.embeddings(input_ids) # (bs, seq_length, dim) return self.transformer( x=inputs_embeds, attn_mask=attention_mask, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) @add_start_docstrings( """DistilBert Model with a `masked language modeling` head on top. 
""", DISTILBERT_START_DOCSTRING, ) class DistilBertForMaskedLM(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.distilbert = DistilBertModel(config) self.vocab_transform = nn.Linear(config.dim, config.dim) self.vocab_layer_norm = nn.LayerNorm(config.dim, eps=1e-12) self.vocab_projector = nn.Linear(config.dim, config.vocab_size) self.init_weights() self.mlm_loss_fct = nn.CrossEntropyLoss() def get_output_embeddings(self): return self.vocab_projector def set_output_embeddings(self, new_embeddings): self.vocab_projector = new_embeddings @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC, ) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``. """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict dlbrt_output = self.distilbert( input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_states = dlbrt_output[0] # (bs, seq_length, dim) prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim) prediction_logits = gelu(prediction_logits) # (bs, seq_length, dim) prediction_logits = self.vocab_layer_norm(prediction_logits) # (bs, seq_length, dim) prediction_logits = self.vocab_projector(prediction_logits) # (bs, seq_length, vocab_size) mlm_loss = None if labels is not None: mlm_loss = self.mlm_loss_fct(prediction_logits.view(-1, prediction_logits.size(-1)), labels.view(-1)) if not return_dict: output = (prediction_logits,) + dlbrt_output[1:] return ((mlm_loss,) + output) if mlm_loss is not None else output return MaskedLMOutput( loss=mlm_loss, logits=prediction_logits, hidden_states=dlbrt_output.hidden_states, attentions=dlbrt_output.attentions, ) @add_start_docstrings( """ DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. 
""", DISTILBERT_START_DOCSTRING, ) class DistilBertForSequenceClassification(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.distilbert = DistilBertModel(config) self.pre_classifier = nn.Linear(config.dim, config.dim) self.classifier = nn.Linear(config.dim, config.num_labels) self.dropout = nn.Dropout(config.seq_classif_dropout) self.init_weights() @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, ) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict distilbert_output = self.distilbert( input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_state = distilbert_output[0] # (bs, seq_len, dim) pooled_output = hidden_state[:, 0] # (bs, dim) pooled_output = self.pre_classifier(pooled_output) # (bs, dim) pooled_output = nn.ReLU()(pooled_output) # (bs, dim) pooled_output = self.dropout(pooled_output) # (bs, dim) logits = self.classifier(pooled_output) # (bs, num_labels) loss = None if labels is not None: if self.num_labels == 1: loss_fct = nn.MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = nn.CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) if not return_dict: output = (logits,) + distilbert_output[1:] return ((loss,) + output) if loss is not None else output return SequenceClassifierOutput( loss=loss, logits=logits, hidden_states=distilbert_output.hidden_states, attentions=distilbert_output.attentions, ) @add_start_docstrings( """ DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
""", DISTILBERT_START_DOCSTRING, ) class DistilBertForQuestionAnswering(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.distilbert = DistilBertModel(config) self.qa_outputs = nn.Linear(config.dim, config.num_labels) assert config.num_labels == 2 self.dropout = nn.Dropout(config.qa_dropout) self.init_weights() @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=QuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC, ) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict distilbert_output = self.distilbert( input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_states = distilbert_output[0] # (bs, max_query_len, dim) hidden_states = self.dropout(hidden_states) # (bs, max_query_len, dim) logits = self.qa_outputs(hidden_states) # (bs, max_query_len, 2) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1) # (bs, max_query_len) end_logits = end_logits.squeeze(-1) # (bs, max_query_len) total_loss = None if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: start_positions = start_positions.squeeze(-1) if len(end_positions.size()) > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms ignored_index = start_logits.size(1) start_positions.clamp_(0, ignored_index) end_positions.clamp_(0, ignored_index) loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index) start_loss = loss_fct(start_logits, start_positions) end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 if not return_dict: output = (start_logits, end_logits) + distilbert_output[1:] return ((total_loss,) + output) if total_loss is not None else output return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, end_logits=end_logits, hidden_states=distilbert_output.hidden_states, attentions=distilbert_output.attentions, ) @add_start_docstrings( """ DistilBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 
""", DISTILBERT_START_DOCSTRING, ) class DistilBertForTokenClassification(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.distilbert = DistilBertModel(config) self.dropout = nn.Dropout(config.dropout) self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC, ) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.distilbert( input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) loss = None if labels is not None: loss_fct = CrossEntropyLoss() # Only keep active parts of the loss if attention_mask is not None: active_loss = attention_mask.view(-1) == 1 active_logits = logits.view(-1, self.num_labels) active_labels = torch.where( active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) ) loss = loss_fct(active_logits, active_labels) else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) if not return_dict: output = (logits,) + outputs[1:] return ((loss,) + output) if loss is not None else output return TokenClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) @add_start_docstrings( """ DistilBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, DISTILBERT_START_DOCSTRING, ) class DistilBertForMultipleChoice(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.distilbert = DistilBertModel(config) self.pre_classifier = nn.Linear(config.dim, config.dim) self.classifier = nn.Linear(config.dim, 1) self.dropout = nn.Dropout(config.seq_classif_dropout) self.init_weights() @add_start_docstrings_to_model_forward( DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") ) @replace_return_docstrings(output_type=MultipleChoiceModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. 
(See :obj:`input_ids` above) Returns: Examples:: >>> from transformers import DistilBertTokenizer, DistilBertForMultipleChoice >>> import torch >>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') >>> model = DistilBertForMultipleChoice.from_pretrained('distilbert-base-cased') >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." >>> choice0 = "It is eaten with a fork and a knife." >>> choice1 = "It is eaten while held in the hand." >>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1 >>> encoding = tokenizer([[prompt, choice0], [prompt, choice1]], return_tensors='pt', padding=True) >>> outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1 >>> # the linear classifier still needs to be trained >>> loss = outputs.loss >>> logits = outputs.logits """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None inputs_embeds = ( inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) if inputs_embeds is not None else None ) outputs = self.distilbert( input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_state = outputs[0] # (bs * num_choices, seq_len, dim) pooled_output = hidden_state[:, 0] # (bs * num_choices, dim) pooled_output = self.pre_classifier(pooled_output) # (bs * num_choices, dim) pooled_output = nn.ReLU()(pooled_output) # (bs * num_choices, dim) pooled_output = self.dropout(pooled_output) # (bs * num_choices, dim) logits = self.classifier(pooled_output) # (bs * num_choices, 1) reshaped_logits = logits.view(-1, num_choices) # (bs, num_choices) loss = None if labels is not None: loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) if not return_dict: output = (reshaped_logits,) + outputs[1:] return ((loss,) + output) if loss is not None else output return MultipleChoiceModelOutput( loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) ================================================ FILE: ros/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 3.5) project(jetson_voice_ros) # Default to C99 if(NOT CMAKE_C_STANDARD) set(CMAKE_C_STANDARD 99) endif() # Default to C++14 if(NOT CMAKE_CXX_STANDARD) set(CMAKE_CXX_STANDARD 14) endif() if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") add_compile_options(-Wall -Wextra -Wpedantic) endif() # find dependencies find_package(ament_cmake REQUIRED) find_package(ament_cmake_python REQUIRED) find_package(std_msgs REQUIRED) find_package(rosidl_default_generators REQUIRED) # generate messages rosidl_generate_interfaces(${PROJECT_NAME} "msg/Audio.msg" "msg/AudioInfo.msg" "msg/IntentSlot.msg" "msg/QuestionAnswerQuery.msg" "msg/QuestionAnswerResult.msg" "msg/Slot.msg" DEPENDENCIES std_msgs ) # install Python modules ament_python_install_package(${PROJECT_NAME}) # install Python executables file(GLOB python_nodes ${PROJECT_NAME}/*.py) install(PROGRAMS ${python_nodes} DESTINATION 
lib/${PROJECT_NAME} ) # install launch files install(DIRECTORY launch DESTINATION share/${PROJECT_NAME}/ ) if(BUILD_TESTING) find_package(ament_lint_auto REQUIRED) # the following line skips the linter which checks for copyrights # uncomment the line when a copyright and license is not present in all source files #set(ament_cmake_copyright_FOUND TRUE) # the following line skips cpplint (only works in a git repo) # uncomment the line when this package is not in a git repo #set(ament_cmake_cpplint_FOUND TRUE) ament_lint_auto_find_test_dependencies() endif() ament_package() ================================================ FILE: ros/jetson_voice_ros/__init__.py ================================================ ================================================ FILE: ros/jetson_voice_ros/asr.py ================================================ #!/usr/bin/env python3 import os import rclpy import numpy as np from rclpy.node import Node from std_msgs.msg import String from jetson_voice import ASR from jetson_voice_ros.msg import Audio class ASRNode(Node): def __init__(self): super().__init__('asr', namespace='voice') # create topics self.audio_subscriber = self.create_subscription(Audio, 'audio_in', self.audio_listener, 10) self.transcript_publisher = self.create_publisher(String, 'transcripts', 10) self.partial_transcript_publisher = self.create_publisher(String, 'partial_transcripts', 10) # get node parameters self.declare_parameter('model', 'quartznet') self.model_name = str(self.get_parameter('model').value) self.get_logger().info(f'model = {self.model_name}') # load the ASR model self.asr = ASR(self.model_name) self.get_logger().info(f"model '{self.model_name}' ready") if self.asr.classification: raise ValueError(f'jetson_voice_ros/asr node does not support ASR classification models') def audio_listener(self, msg): if msg.info.sample_rate != self.asr.sample_rate: self.get_logger().warning(f"audio has sample_rate {msg.info.sample_rate}, but ASR expects sample_rate {self.asr.sample_rate}") samples = np.frombuffer(msg.data, dtype=msg.info.sample_format) self.get_logger().debug(f'received audio samples {samples.shape} dtype={samples.dtype}') # rms={np.sqrt(np.mean(samples**2))}') results = self.asr(samples) for transcript in results: text = transcript['text'].strip() if len(text) == 0: continue msg = String() msg.data = text self.get_logger().info(f"transcript: {text}") if transcript['end']: self.transcript_publisher.publish(msg) self.partial_transcript_publisher.publish(msg) def main(args=None): rclpy.init(args=args) node = ASRNode() rclpy.spin(node) node.destroy_node() rclpy.shutdown() if __name__ == "__main__": main() ================================================ FILE: ros/jetson_voice_ros/audio_input.py ================================================ #!/usr/bin/env python3 import os import rclpy import numpy as np from rclpy.node import Node from jetson_voice.utils import AudioInput, audio_to_int16 from jetson_voice_ros.msg import Audio class AudioInputNode(Node): def __init__(self): super().__init__('audio_input', namespace='voice') # create topics self.audio_publisher = self.create_publisher(Audio, 'audio_in', 10) # get node parameters self.declare_parameter('device', '') # input audio device ID or name self.declare_parameter('sample_rate', 16000) # sample rate (in Hz) self.declare_parameter('chunk_size', 16000) # number of samples per buffer self.declare_parameter('resets', -1) # number of times to reset the device (-1 is infinite) self.device_name =
str(self.get_parameter('device').value) self.sample_rate = self.get_parameter('sample_rate').value self.chunk_size = self.get_parameter('chunk_size').value self.resets = self.get_parameter('resets').value self.reset_count = 0 if self.device_name == '': raise ValueError("must set the 'device' parameter to either an input audio device ID/name or the path to a .wav file") self.get_logger().info(f'device={self.device_name}') self.get_logger().info(f'sample_rate={self.sample_rate}') self.get_logger().info(f'chunk_size={self.chunk_size}') self.get_logger().info(f'resets={self.resets}') # check if this is an audio device or a wav file file_ext = os.path.splitext(self.device_name)[1].lower() if file_ext == '.wav' or file_ext == '.wave': wav = self.device_name mic = '' else: wav = '' mic = self.device_name # create audio device self.device = AudioInput(wav=wav, mic=mic, sample_rate=self.sample_rate, chunk_size=self.chunk_size) self.device.open() # create a timer to check for audio samples self.timer = self.create_timer(self.chunk_size / self.sample_rate * 0.75, self.publish_audio) def publish_audio(self): while True: samples = self.device.next() if samples is not None: break self.get_logger().warning('no audio samples were returned from the audio device') if self.resets < 0 or self.reset_count < self.resets: self.reset_count += 1 self.get_logger().warning(f'resetting audio device {self.device_name} (attempt {self.reset_count} of {self.resets})') self.device.reset() else: self.get_logger().error(f'maximum number of audio device resets has been reached ({self.resets})') return if samples.dtype == np.float32: # convert to int16 to make the message smaller samples = audio_to_int16(samples) if samples.dtype != np.int16: # the other voice nodes expect int16/float32 raise ValueError(f'audio samples are expected to have datatype int16, but they were {samples.dtype}') self.get_logger().debug(f'publishing audio samples {samples.shape} dtype={samples.dtype}') # rms={np.sqrt(np.mean(samples**2))}') # publish message msg = Audio() msg.header.stamp = self.get_clock().now().to_msg() msg.header.frame_id = self.device_name msg.info.channels = 1 # AudioInput is set to mono msg.info.sample_rate = self.sample_rate msg.info.sample_format = str(samples.dtype) msg.data = samples.tobytes() self.audio_publisher.publish(msg) def main(args=None): rclpy.init(args=args) node = AudioInputNode() rclpy.spin(node) node.destroy_node() rclpy.shutdown() if __name__ == "__main__": main() ================================================ FILE: ros/jetson_voice_ros/audio_output.py ================================================ #!/usr/bin/env python3 import os import rclpy import numpy as np from rclpy.node import Node from jetson_voice import AudioOutput from jetson_voice_ros.msg import Audio from soundfile import SoundFile class AudioOutputNode(Node): def __init__(self): super().__init__('audio_output', namespace='voice') # create topics self.audio_subscriber = self.create_subscription(Audio, 'audio_out', self.audio_listener, 10) # get node parameters self.declare_parameter('device', '') # output audio device ID or name self.declare_parameter('sample_rate', 16000) # sample rate (in Hz) self.declare_parameter('chunk_size', 4096) # number of samples per buffer self.device_name = str(self.get_parameter('device').value) self.sample_rate = self.get_parameter('sample_rate').value self.chunk_size = self.get_parameter('chunk_size').value if self.device_name == '': raise ValueError("must set the 'device' parameter to either an output audio device ID/name
or the path to a .wav file") self.get_logger().info(f'device={self.device_name}') self.get_logger().info(f'sample_rate={self.sample_rate}') self.get_logger().info(f'chunk_size={self.chunk_size}') # check if this is an audio device or a wav file file_ext = os.path.splitext(self.device_name)[1].lower() if file_ext == '.wav' or file_ext == '.wave': self.wav = SoundFile(self.device_name, mode='w', samplerate=self.sample_rate, channels=1) self.device = None else: self.wav = None self.device = AudioOutput(self.device_name, sample_rate=self.sample_rate, chunk_size=self.chunk_size) def audio_listener(self, msg): #self.get_logger().debug('received new audio message') #self.get_logger().debug(f'{msg.header}') #self.get_logger().debug(f'{msg.info}') if msg.info.sample_rate != self.sample_rate: self.get_logger().warning(f"audio has sample_rate {msg.info.sample_rate}, but audio device is using sample_rate {self.sample_rate}") samples = np.frombuffer(msg.data, dtype=msg.info.sample_format) self.get_logger().debug(f'received audio samples {samples.shape} dtype={samples.dtype}') # rms={np.sqrt(np.mean(samples**2))}') if self.device is not None: self.device.write(samples) else: self.wav.write(samples) def main(args=None): rclpy.init(args=args) node = AudioOutputNode() rclpy.spin(node) node.destroy_node() rclpy.shutdown() if __name__ == "__main__": main() ================================================ FILE: ros/jetson_voice_ros/nlp_intent_slot.py ================================================ #!/usr/bin/env python3 import os import rclpy from rclpy.node import Node from std_msgs.msg import String from jetson_voice import IntentSlot as IntentSlotFactory from jetson_voice_ros.msg import IntentSlot, Slot class NLPIntentSlotNode(Node): def __init__(self): super().__init__('nlp_intent_slot', namespace='voice') # create topics self.query_subscriber = self.create_subscription(String, 'intent_slot_query', self.query_listener, 10) self.result_publisher = self.create_publisher(IntentSlot, 'intent_slot_results', 10) # get node parameters self.declare_parameter('model', 'distilbert_intent') self.model_name = str(self.get_parameter('model').value) self.get_logger().info(f'model = {self.model_name}') # load the IntentSlot model self.model = IntentSlotFactory(self.model_name) self.get_logger().info(f"model '{self.model_name}' ready") def query_listener(self, msg): text = msg.data.strip() if len(text) == 0: return self.get_logger().info(f"running NLP Intent/Slot query: '{text}'") # run the model results = self.model(text) self.get_logger().info(f"intent: '{results['intent']}'") self.get_logger().info(f"score: {results['score']}") for slot in results['slots']: self.get_logger().info(str(slot)) # create message msg = IntentSlot() msg.query.data = text msg.intent.data = results['intent'] msg.score = float(results['score']) slots = [] for slot in results['slots']: slot_msg = Slot() slot_msg.slot.data = slot['slot'] slot_msg.text.data = slot['text'] slot_msg.score = float(slot['score']) slots.append(slot_msg) msg.slots = tuple(slots) # publish message self.result_publisher.publish(msg) def main(args=None): rclpy.init(args=args) node = NLPIntentSlotNode() rclpy.spin(node) node.destroy_node() rclpy.shutdown() if __name__ == "__main__": main() ================================================ FILE: ros/jetson_voice_ros/nlp_question_answer.py ================================================ #!/usr/bin/env python3 import os import rclpy from rclpy.node import Node from std_msgs.msg import String from jetson_voice import
QuestionAnswer as QuestionAnswerFactory from jetson_voice_ros.msg import QuestionAnswerQuery, QuestionAnswerResult class NLPQuestionAnswerNode(Node): def __init__(self): super().__init__('nlp_question_answer', namespace='voice') # create topics self.query_subscriber = self.create_subscription(QuestionAnswerQuery, 'question_answer_query', self.query_listener, 10) self.result_publisher = self.create_publisher(QuestionAnswerResult, 'question_answer_results', 10) # get node parameters self.declare_parameter('model', 'distilbert_qa_384') self.model_name = str(self.get_parameter('model').value) self.get_logger().info(f'model = {self.model_name}') # load the QA model self.model = QuestionAnswerFactory(self.model_name) self.get_logger().info(f"model '{self.model_name}' ready") def query_listener(self, msg): question = msg.question.data.strip() context = msg.context.data.strip() if len(question) == 0 or len(context) == 0: return self.get_logger().info(f"running NLP Question/Answer query:") self.get_logger().info(f"question: '{question}'") self.get_logger().info(f"context:") self.get_logger().info(context) # run the model results = self.model((question,context)) self.get_logger().info(f"answer: '{results['answer']}'") self.get_logger().info(f"score: {results['score']}") # create message msg = QuestionAnswerResult() msg.question.data = question msg.answer.data = results['answer'] msg.score = float(results['score']) # publish message self.result_publisher.publish(msg) def main(args=None): rclpy.init(args=args) node = NLPQuestionAnswerNode() rclpy.spin(node) node.destroy_node() rclpy.shutdown() if __name__ == "__main__": main() ================================================ FILE: ros/jetson_voice_ros/tts.py ================================================ #!/usr/bin/env python3 import os import rclpy import numpy as np from rclpy.node import Node from std_msgs.msg import String from jetson_voice import TTS from jetson_voice.utils import audio_to_int16 from jetson_voice_ros.msg import Audio class TTSNode(Node): def __init__(self): super().__init__('tts', namespace='voice') # create topics self.text_subscriber = self.create_subscription(String, 'tts_text', self.text_listener, 10) self.audio_publisher = self.create_publisher(Audio, 'tts_audio', 10) # get node parameters self.declare_parameter('model', 'fastpitch_hifigan') self.model_name = str(self.get_parameter('model').value) self.get_logger().info(f'model = {self.model_name}') # load the TTS model self.tts = TTS(self.model_name) self.get_logger().info(f"model '{self.model_name}' ready") def text_listener(self, msg): text = msg.data.strip() if len(text) == 0: return self.get_logger().info(f"running TTS on '{text}'") samples = self.tts(text) samples = audio_to_int16(samples) # publish message msg = Audio() msg.header.stamp = self.get_clock().now().to_msg() msg.header.frame_id = self.model_name msg.info.channels = 1 msg.info.sample_rate = self.tts.sample_rate msg.info.sample_format = str(samples.dtype) msg.data = samples.tobytes() self.audio_publisher.publish(msg) def main(args=None): rclpy.init(args=args) node = TTSNode() rclpy.spin(node) node.destroy_node() rclpy.shutdown() if __name__ == "__main__": main() ================================================ FILE: ros/launch/asr.launch.py ================================================ # # Launch file for streaming ASR (automatic speech recognition) from a microphone or wav file.
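# It starts an audio_input node (reading from a microphone or a wav file) that publishes /voice/audio_in, feeding an asr node that publishes /voice/transcripts and /voice/partial_transcripts.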
# import os from launch import LaunchDescription from launch.actions import IncludeLaunchDescription, DeclareLaunchArgument from launch.launch_description_sources import PythonLaunchDescriptionSource from launch.substitutions import ThisLaunchFileDir, LaunchConfiguration from launch_ros.actions import Node def generate_launch_description(): log_level = DeclareLaunchArgument('log_level', default_value='info') asr_model = DeclareLaunchArgument('model', default_value='quartznet') input_device = DeclareLaunchArgument('input_device', default_value='/jetson-voice/data/audio/dusty.wav') audio_input = Node(package='jetson_voice_ros', node_executable='audio_input.py', parameters=[ {"device": LaunchConfiguration('input_device')}, ], arguments=['--ros-args', '--log-level', LaunchConfiguration('log_level')], output='screen', emulate_tty=True) asr_node = Node(package='jetson_voice_ros', node_executable='asr.py', parameters=[ {"model": LaunchConfiguration('model')}, ], arguments=['--ros-args', '--log-level', LaunchConfiguration('log_level')], output='screen', emulate_tty=True) return LaunchDescription([ log_level, asr_model, input_device, audio_input, asr_node, ]) ================================================ FILE: ros/launch/audio_playback.launch.py ================================================ # # Launch file for playback of an audio stream or wav file. # import os from launch import LaunchDescription from launch.actions import IncludeLaunchDescription, DeclareLaunchArgument from launch.launch_description_sources import PythonLaunchDescriptionSource from launch.substitutions import ThisLaunchFileDir, LaunchConfiguration from launch_ros.actions import Node def generate_launch_description(): log_level = DeclareLaunchArgument('log_level', default_value='info') input_device = DeclareLaunchArgument('input_device', default_value='/jetson-voice/data/audio/dusty.wav') output_device = DeclareLaunchArgument('output_device', default_value='/jetson-voice/data/audio/output.wav') audio_input = Node(package='jetson_voice_ros', node_executable='audio_input.py', parameters=[ {"device": LaunchConfiguration('input_device')}, ], arguments=['--ros-args', '--log-level', LaunchConfiguration('log_level')], output='screen', emulate_tty=True) audio_output = Node(package='jetson_voice_ros', node_executable='audio_output.py', parameters=[ {"device": LaunchConfiguration('output_device')}, ], remappings=[ ("/voice/audio_out", "/voice/audio_in"), ], arguments=['--ros-args', '--log-level', LaunchConfiguration('log_level')], output='screen', emulate_tty=True) return LaunchDescription([ log_level, input_device, output_device, audio_input, audio_output, ]) ================================================ FILE: ros/launch/tts.launch.py ================================================ # # Launch file for TTS (text-to-speech), with playback to an audio device or wav file.
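# It starts a tts node that synthesizes text from /voice/tts_text into /voice/tts_audio, which is remapped into an audio_output node configured for the 22050 Hz synthesis rate.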
# import os from launch import LaunchDescription from launch.actions import IncludeLaunchDescription, DeclareLaunchArgument from launch.launch_description_sources import PythonLaunchDescriptionSource from launch.substitutions import ThisLaunchFileDir, LaunchConfiguration from launch_ros.actions import Node def generate_launch_description(): log_level = DeclareLaunchArgument('log_level', default_value='info') tts_model = DeclareLaunchArgument('model', default_value='fastpitch_hifigan') output_device = DeclareLaunchArgument('output_device', default_value='/jetson-voice/data/audio/tts_test.wav') tts_node = Node(package='jetson_voice_ros', node_executable='tts.py', parameters=[ {"model": LaunchConfiguration('model')}, ], arguments=['--ros-args', '--log-level', LaunchConfiguration('log_level')], output='screen', emulate_tty=True) audio_output = Node(package='jetson_voice_ros', node_executable='audio_output.py', parameters=[ {"device": LaunchConfiguration('output_device')}, {"sample_rate": 22050}, ], remappings=[ ("/voice/audio_out", "/voice/tts_audio"), ], arguments=['--ros-args', '--log-level', LaunchConfiguration('log_level')], output='screen', emulate_tty=True) return LaunchDescription([ log_level, tts_model, output_device, tts_node, audio_output, ]) ================================================ FILE: ros/msg/Audio.msg ================================================ std_msgs/Header header AudioInfo info uint8[] data ================================================ FILE: ros/msg/AudioInfo.msg ================================================ # Number of channels uint8 channels # Sampling rate [Hz] uint32 sample_rate # Audio format (e.g. int16, float32) string sample_format # Audio coding format (e.g. wav, mp3) string coding_format ================================================ FILE: ros/msg/IntentSlot.msg ================================================ # the original query text std_msgs/String query # the classified intent label std_msgs/String intent # the intent probability between [0,1] float32 score # list of slots jetson_voice_ros/Slot[] slots ================================================ FILE: ros/msg/QuestionAnswerQuery.msg ================================================ # the question being asked std_msgs/String question # the context paragraph std_msgs/String context ================================================ FILE: ros/msg/QuestionAnswerResult.msg ================================================ # the question that was asked std_msgs/String question # the answer to the question std_msgs/String answer # the confidence of the answer between [0,1] float32 score ================================================ FILE: ros/msg/Slot.msg ================================================ # the slot class label std_msgs/String slot # the relevant text from the original query std_msgs/String text # classification probability between [0,1] float32 score ================================================ FILE: ros/package.xml ================================================ jetson_voice_ros 0.0.0 ROS2 nodes for jetson_voice Dustin Franklin MIT rclpy std_msgs ament_cmake ament_cmake_python rosidl_default_generators rosidl_default_runtime rosidl_interface_packages ament_lint_auto ament_lint_common ament_cmake ================================================ FILE: scripts/list_audio_devices.py ================================================ #!/usr/bin/env python3 # coding: utf-8 from jetson_voice import list_audio_devices list_audio_devices()
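# The device IDs/names printed here can be passed to the '--mic' option of the other scripts and to the 'device' parameter of the ROS audio_input/audio_output nodes, e.g.:
#   python3 scripts/record_mic.py --mic <device> --output data/audio/test.wav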
================================================ FILE: scripts/list_models.py ================================================ #!/usr/bin/env python3 # coding: utf-8 from jetson_voice import list_models list_models() ================================================ FILE: scripts/nemo_export_onnx.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import argparse import pprint import json import nemo import nemo.collections.asr as nemo_asr import nemo.collections.nlp as nemo_nlp import nemo.collections.tts as nemo_tts from omegaconf import OmegaConf, open_dict model_types = { 'asr' : nemo_asr.models.ASRModel, 'asr_classification' : nemo_asr.models.ASRModel, 'qa' : nemo_nlp.models.QAModel, 'intent_slot' : nemo_nlp.models.IntentSlotClassificationModel, 'text_classification' : nemo_nlp.models.TextClassificationModel, 'token_classification' : nemo_nlp.models.TokenClassificationModel } parser = argparse.ArgumentParser() parser.add_argument('--type', choices=model_types.keys(), type=str, required=True) parser.add_argument('--model', type=str, required=True) # 'QuartzNet15x5Base-En' parser.add_argument('--output', default='', type=str, required=True) args = parser.parse_args() print('nemo version:', nemo.__version__) # load model depending on extension/type extension = os.path.splitext(args.model)[1].lower() if extension == '.nemo': model = model_types[args.type].restore_from(args.model) elif extension == '.ckpt': model = model_types[args.type].load_from_checkpoint(args.model) else: #elif: len(extension) == 0: model = model_types[args.type].from_pretrained(model_name=args.model) #else: # raise ValueError(f'model {args.model} has invalid extension {extension}') # add type string so we can more easily track this later with open_dict(model._cfg): model._cfg.type = args.type model._cfg.model_path = os.path.basename(args.output) model._cfg.model_origin = args.model print('') print('###############################################') print('## Model Config') print('###############################################') pprint.pprint(OmegaConf.to_container(model._cfg)) print('') base_path = os.path.splitext(args.output)[0] json_path = base_path + '.json' yaml_path = base_path + '.yaml' #with open(yaml_path, 'w') as yaml_file: # OmegaConf.save(config=model._cfg, f=yaml_file) # print('saved model config to {:s}'.format(yaml_path)) with open(json_path, 'w') as json_file: json.dump(OmegaConf.to_container(model._cfg), json_file, indent=3) print('saved model config to {:s}'.format(json_path)) model.export(args.output, verbose=True) print('\nexported {:s} to {:s}'.format(args.model, args.output)) ================================================ FILE: scripts/nemo_list_models.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import nemo import nemo.collections.asr as nemo_asr import nemo.collections.nlp as nemo_nlp import nemo.collections.tts as nemo_tts print('nemo version:', nemo.__version__) asr_archs = [model for model in dir(nemo_asr.models) if model.endswith("Model")] nlp_archs = [model for model in dir(nemo_nlp.models) if model.endswith("Model")] tts_archs = [model for model in dir(nemo_tts.models) if model.endswith("Model")] print('ASR architectures:', asr_archs) print('NLP architectures:', nlp_archs) print('TTS architectures:', tts_archs) for asr_arch in asr_archs: print('') print('#####################################################') print('## nemo_asr.models.{:s}'.format(asr_arch)) 
print('#####################################################') print(getattr(nemo_asr.models, asr_arch).list_available_models()) for nlp_arch in nlp_archs: print('') print('#####################################################') print('## nemo_nlp.models.{:s}'.format(nlp_arch)) print('#####################################################') print(getattr(nemo_nlp.models, nlp_arch).list_available_models()) print('') print('#####################################################') print('## nemo_nlp.models.pretrained_lm_models') print('#####################################################') for model in nemo_nlp.modules.get_pretrained_lm_models_list(): print(model) for tts_arch in tts_archs: print('') print('#####################################################') print('## nemo_tts.models.{:s}'.format(tts_arch)) print('#####################################################') print(getattr(nemo_tts.models, tts_arch).list_available_models()) ================================================ FILE: scripts/nemo_train_classifier.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import argparse import torch import pytorch_lightning as pl from omegaconf import OmegaConf from nemo.utils.exp_manager import exp_manager from nemo.collections import nlp as nemo_nlp """ Example SST2 'Stanford Sentiment Treebank' dataset from: https://gluebenchmark.com/tasks https://dl.fbaipublicfiles.com/glue/data/SST-2.zip Pre-processing commands: sed 1d train.tsv > train_nemo_format.tsv sed 1d test.tsv > test_nemo_format.tsv sed 1d dev.tsv > dev_nemo_format.tsv """ # parse args parser = argparse.ArgumentParser() parser.add_argument('--dataset', default='datasets/sentiment/SST-2', type=str) parser.add_argument('--config', default='config/text_classification_config.yaml', type=str) parser.add_argument('--model', default='distilbert-base-uncased', type=str) # "bert-base-uncased" parser.add_argument('--classes', default=2, type=int) parser.add_argument('--epochs', default=5, type=int) parser.add_argument('--samples', default=-1, type=int) parser.add_argument('--batch-size', default=32, type=int) parser.add_argument('--learning-rate', '--lr', default=0.00002, type=float) parser.add_argument('--max-seq-length', default=128, type=int) args = parser.parse_args() print(args) # load config config = OmegaConf.load(args.config) print(f'loaded config from {args.config}') # setup config config.model.train_ds.file_path = os.path.join(args.dataset, 'train_nemo_format.tsv') config.model.validation_ds.file_path = os.path.join(args.dataset, 'dev_nemo_format.tsv') config.model.test_ds.file_path = os.path.join(args.dataset, 'test_nemo_format.tsv') config.model.dataset.num_classes = args.classes config.model.dataset.max_seq_length = args.max_seq_length config.model.language_model.pretrained_model_name = args.model config.model.tokenizer.tokenizer_name = args.model config.model.train_ds.batch_size = args.batch_size config.model.validation_ds.batch_size = args.batch_size config.model.test_ds.batch_size = args.batch_size if args.samples > 0: config.model.train_ds.num_samples = args.samples config.model.validation_ds.num_samples = args.samples config.model.test_ds.num_samples = args.samples config.model.optim.lr = args.learning_rate config.trainer.gpus = 1 if torch.cuda.is_available() else 0 config.trainer.precision = 16 if torch.cuda.is_available() else 32 # For mixed precision training, use precision=16 and amp_level=O1 config.trainer.max_epochs = args.epochs config.trainer.accelerator = None # Remove
distributed training flags print(OmegaConf.to_yaml(config)) # create trainer + model trainer = pl.Trainer(**config.trainer) model = nemo_nlp.models.TextClassificationModel(config.model, trainer=trainer) exp_dir = str(exp_manager(trainer, config.get("exp_manager", None))) print('experiment directory:', exp_dir) # start the training trainer.fit(model) # test the model eval_checkpoint_path = trainer.checkpoint_callback.best_model_path eval_model = nemo_nlp.models.TextClassificationModel.load_from_checkpoint(checkpoint_path=eval_checkpoint_path) print('loaded checkpoint for eval:', eval_checkpoint_path) eval_model.setup_test_data(test_data_config=config.model.validation_ds) trainer.test(model=eval_model, ckpt_path=None, verbose=True) # example inference queries = [ 'by the end of no such thing the audience , like beatrice , has a watchful affection for the monster .', 'director rob marshall went out gunning to make a great one .', 'uneasy mishmash of styles and genres .' ] results = eval_model.classifytext( queries=queries, batch_size=len(queries), max_seq_length=config.model.dataset.max_seq_length ) print('The prediction results of some sample queries with the trained model:') for query, result in zip(queries, results): print(f'Query : {query}') print(f'Predicted label: {result}') print('\ndone training:', exp_dir) ================================================ FILE: scripts/nemo_train_intent.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import argparse import torch import pytorch_lightning as pl from omegaconf import OmegaConf from nemo.utils.exp_manager import exp_manager from nemo.collections import nlp as nemo_nlp """ Example dataset from: https://github.com/xliuhw/NLU-Evaluation-Data https://github.com/xliuhw/NLU-Evaluation-Data/archive/master.zip Command used to pre-process the data: python3 intent_import_datasets.py \ --dataset_name=assistant \ --source_data_dir=datasets/intent/NLU-Evaluation-Data-master \ --target_data_dir=datasets/intent/NLU-Evaluation-Data-master/nemo_format """ # parse args parser = argparse.ArgumentParser() parser.add_argument('--dataset', default='data/datasets/NLU-Evaluation-Data-master/nemo_format', type=str) parser.add_argument('--config', default='data/config/training/intent_slot_classification_config.yaml', type=str) parser.add_argument('--exp-dir', default='data/nemo_experiments', type=str) parser.add_argument('--model', default='distilbert-base-uncased', type=str) # "bert-base-uncased" parser.add_argument('--epochs', default=5, type=int) parser.add_argument('--samples', default=-1, type=int) parser.add_argument('--batch-size', default=32, type=int) parser.add_argument('--learning-rate', '--lr', default=0.00002, type=float) parser.add_argument('--max-seq-length', default=50, type=int) args = parser.parse_args() print(args) # load config config = OmegaConf.load(args.config) print(f'loaded config from {args.config}') # setup config config.model.data_dir = args.dataset #os.path.join(args.dataset, 'nemo_format') config.model.language_model.max_seq_length = args.max_seq_length config.model.language_model.pretrained_model_name = args.model config.model.tokenizer.tokenizer_name = args.model config.model.train_ds.batch_size = args.batch_size config.model.validation_ds.batch_size = args.batch_size config.model.test_ds.batch_size = args.batch_size if args.samples > 0: config.model.train_ds.num_samples = args.samples
config.model.validation_ds.num_samples = args.samples config.model.test_ds.num_samples = args.samples config.model.optim.lr = args.learning_rate config.trainer.gpus = 1 if torch.cuda.is_available() else 0 config.trainer.precision = 16 if torch.cuda.is_available() else 32 # For mixed precision training, use precision=16 and amp_level=O1 config.trainer.max_epochs = args.epochs config.trainer.accelerator = None # Remove distributed training flags print(OmegaConf.to_yaml(config)) # create trainer + model trainer = pl.Trainer(**config.trainer) model = nemo_nlp.models.IntentSlotClassificationModel(config.model, trainer=trainer) # set experiment directory exp_cfg = config.get('exp_manager', None) exp_cfg['exp_dir'] = args.exp_dir exp_dir = str(exp_manager(trainer, exp_cfg)) print('experiment directory:', exp_dir) # start the training trainer.fit(model) # test the model eval_checkpoint_path = trainer.checkpoint_callback.best_model_path eval_model = nemo_nlp.models.IntentSlotClassificationModel.load_from_checkpoint(checkpoint_path=eval_checkpoint_path) print('loaded checkpoint for eval:', eval_checkpoint_path) eval_model.setup_test_data(test_data_config=config.model.test_ds) trainer.test(model=eval_model, ckpt_path=None, verbose=True) # example inference queries = [ 'set alarm for seven thirty am', 'lower volume by fifty percent', 'what is my schedule for tomorrow', ] pred_intents, pred_slots = eval_model.predict_from_examples(queries, config.model.test_ds) print('The prediction results of some sample queries with the trained model:') for query, intent, slots in zip(queries, pred_intents, pred_slots): print(f'Query : {query}') print(f'Predicted Intent: {intent}') print(f'Predicted Slots: {slots}') print('\ndone training:', exp_dir) ================================================ FILE: scripts/nemo_train_ner.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import argparse import torch import pytorch_lightning as pl from omegaconf import OmegaConf from nemo.utils.exp_manager import exp_manager from nemo.collections import nlp as nemo_nlp """ Example GMB (Groningen Meaning Bank) dataset from: https://dldata-public.s3.us-east-2.amazonaws.com/gmb_v_2.2.0_clean.zip This version of the dataset is already pre-processed, but other IOB format data can be converted using the ner_import_iob.py tool. 
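The expected layout is paired text/labels files (e.g. the text_dev.txt and labels_dev.txt files used for evaluation below), with one whitespace-tokenized sentence per line of the text file and the matching space-separated IOB tags on the corresponding line of the labels file.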
""" # parse args parser = argparse.ArgumentParser() parser.add_argument('--dataset', default='datasets/ner/gmb_v_2.2.0_clean', type=str) parser.add_argument('--config', default='config/token_classification_config.yaml', type=str) parser.add_argument('--model', default='distilbert-base-uncased', type=str) # "bert-base-uncased" parser.add_argument('--epochs', default=5, type=int) parser.add_argument('--samples', default=-1, type=int) parser.add_argument('--batch-size', default=32, type=int) parser.add_argument('--learning-rate', '--lr', default=0.00005, type=float) parser.add_argument('--max-seq-length', default=128, type=int) args = parser.parse_args() print(args) # load config config = OmegaConf.load(args.config) print(f'loaded config from {args.config}') # setup config config.model.dataset.data_dir = args.dataset config.model.dataset.max_seq_length = args.max_seq_length config.model.language_model.pretrained_model_name = args.model config.model.tokenizer.tokenizer_name = args.model config.model.train_ds.batch_size = args.batch_size config.model.validation_ds.batch_size = args.batch_size config.model.test_ds.batch_size = args.batch_size if args.samples > 0: config.model.train_ds.num_samples = args.samples config.model.validation_ds.num_samples = args.samples config.model.test_ds.num_samples = args.samples config.model.optim.lr = args.learning_rate config.trainer.gpus = 1 if torch.cuda.is_available() else 0 config.trainer.precision = 16 if torch.cuda.is_available() else 32 # For mixed precision training, use precision=16 and amp_level=O1 config.trainer.max_epochs = args.epochs config.trainer.accelerator = None # Remove distributed training flags print(OmegaConf.to_yaml(config)) # create trainer + model trainer = pl.Trainer(**config.trainer) model = nemo_nlp.models.TokenClassificationModel(config.model, trainer=trainer) exp_dir = str(exp_manager(trainer, config.get("exp_manager", None))) print('experiment directory:', exp_dir) # start the training trainer.fit(model) # test the model eval_checkpoint_path = trainer.checkpoint_callback.best_model_path eval_model = nemo_nlp.models.TokenClassificationModel.load_from_checkpoint(checkpoint_path=eval_checkpoint_path) print('loaded checkpoint for eval:', eval_checkpoint_path) eval_model.setup_test_data(test_data_config=config.model.test_ds) trainer.test(model=eval_model, ckpt_path=None, verbose=True) # example inference eval_model.evaluate_from_file( text_file=os.path.join(args.dataset, 'text_dev.txt'), labels_file=os.path.join(args.dataset, 'labels_dev.txt'), output_dir=exp_dir, ) print('\ndone training:', exp_dir) ================================================ FILE: scripts/nemo_train_qa.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import argparse import torch import pytorch_lightning as pl from omegaconf import OmegaConf from nemo.utils.exp_manager import exp_manager from nemo.collections import nlp as nemo_nlp # parse args parser = argparse.ArgumentParser() parser.add_argument('--dataset', default='datasets/squad', type=str) parser.add_argument('--dataset-version', default='v1.1', type=str) parser.add_argument('--config', default='config/question_answering_squad_config.yaml', type=str) parser.add_argument('--model', default='distilbert-base-uncased', type=str) # "bert-base-uncased" parser.add_argument('--epochs', default=1, type=int) parser.add_argument('--samples', default=-1, type=int) # 5000 parser.add_argument('--batch-size', default=12, type=int) parser.add_argument('--learning-rate', 
'--lr', default=0.00003, type=float) parser.add_argument('--max-seq-length', default=384, type=int) parser.add_argument('--output', default='', type=str) # defaults to ./nemo_experiments args = parser.parse_args() print(args) # load config config = OmegaConf.load(args.config) print(f'loaded config from {args.config}') # setup config config.model.train_ds.file = os.path.join(args.dataset, args.dataset_version, f'train-{args.dataset_version}.json') config.model.validation_ds.file = os.path.join(args.dataset, args.dataset_version, f'dev-{args.dataset_version}.json') config.model.test_ds.file = config.model.validation_ds.file config.model.language_model.pretrained_model_name = args.model config.model.tokenizer.tokenizer_name = args.model config.model.dataset.max_seq_length = args.max_seq_length if config.model.dataset.doc_stride >= config.model.dataset.max_seq_length: config.model.dataset.doc_stride = int(config.model.dataset.max_seq_length / 2) config.model.train_ds.batch_size = args.batch_size config.model.validation_ds.batch_size = args.batch_size config.model.test_ds.batch_size = args.batch_size if args.samples > 0: config.model.train_ds.num_samples = args.samples config.model.validation_ds.num_samples = args.samples config.model.test_ds.num_samples = args.samples config.model.optim.lr = args.learning_rate config.trainer.gpus = 1 if torch.cuda.is_available() else 0 config.trainer.precision = 16 if torch.cuda.is_available() else 32 # For mixed precision training, use precision=16 and amp_level=O1 config.trainer.max_epochs = args.epochs config.trainer.accelerator = None # Remove distributed training flags if args.output != '': config.exp_manager.exp_dir = args.output print(OmegaConf.to_yaml(config)) # create trainer + model trainer = pl.Trainer(**config.trainer) model = nemo_nlp.models.QAModel(cfg=config.model, trainer=trainer) exp_dir = str(exp_manager(trainer, config.get("exp_manager", None))) print('experiment directory:', exp_dir) # start the training trainer.fit(model) # test the model model.setup_test_data(test_data_config=config.model.test_ds) trainer.test(model) # example inference all_preds, all_nbests = model.inference(file=config.model.test_ds.file, output_nbest_file=os.path.join(exp_dir, 'output_nbest.json'), output_prediction_file=os.path.join(exp_dir, 'output_prediction.json'), batch_size=args.batch_size, num_samples=10) for _, item in all_preds.items(): print(f"question: {item[0]} answer: {item[1]}") print('\ndone training:', exp_dir) ================================================ FILE: scripts/os_version.sh ================================================ #!/usr/bin/env bash ARCH=$(uname -i) echo "ARCH: $ARCH" if [ $ARCH = "aarch64" ]; then L4T_VERSION_STRING=$(head -n 1 /etc/nv_tegra_release) if [ -z "$L4T_VERSION_STRING" ]; then echo "reading L4T version from \"dpkg-query --show nvidia-l4t-core\"" L4T_VERSION_STRING=$(dpkg-query --showformat='${Version}' --show nvidia-l4t-core) L4T_VERSION_ARRAY=(${L4T_VERSION_STRING//./ }) #echo ${L4T_VERSION_ARRAY[@]} #echo ${#L4T_VERSION_ARRAY[@]} L4T_RELEASE=${L4T_VERSION_ARRAY[0]} L4T_REVISION=${L4T_VERSION_ARRAY[1]} else echo "reading L4T version from /etc/nv_tegra_release" L4T_RELEASE=$(echo $L4T_VERSION_STRING | cut -f 2 -d ' ' | grep -Po '(?<=R)[^;]+') L4T_REVISION=$(echo $L4T_VERSION_STRING | cut -f 2 -d ',' | grep -Po '(?<=REVISION: )[^;]+') fi L4T_REVISION_MAJOR=${L4T_REVISION:0:1} L4T_REVISION_MINOR=${L4T_REVISION:2:1} L4T_VERSION="$L4T_RELEASE.$L4T_REVISION" echo "L4T BSP Version: L4T R$L4T_VERSION" fi
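# NOTE: on x86_64 hosts there is no /etc/nv_tegra_release or nvidia-l4t-core package, so only ARCH is printed and the L4T_* variables are left unset.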
================================================ FILE: scripts/record_mic.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import sys import signal import argparse from jetson_voice import AudioInput, list_audio_devices from soundfile import SoundFile parser = argparse.ArgumentParser() parser.add_argument('--mic', default=None, type=str, required=True, help='device name or number of input microphone') parser.add_argument('--output', default=None, type=str, required=True, help='path to output wav/ogg/flac file') parser.add_argument('--sample-rate', default=16000, type=int, help='sample rate (in Hz)') parser.add_argument('--list-devices', action='store_true', help='list audio input devices') args = parser.parse_args() print(args) # list audio devices if args.list_devices: list_audio_devices() sys.exit() # setup exit signal handler record = True def signal_handler(sig, frame): global record record = False print('Ctrl+C received, exiting...') signal.signal(signal.SIGINT, signal_handler) # create the output wav output_wav = SoundFile(args.output, mode='w', samplerate=args.sample_rate, channels=1) # create the audio device input_mic = AudioInput(mic=args.mic, sample_rate=args.sample_rate, chunk_size=4096) # loop until user exits sample_count = 0 while record: samples = input_mic.next() output_wav.write(samples) sample_count += len(samples) output_wav.close() print(f"saved {sample_count / args.sample_rate:.2f} seconds of audio to '{args.output}'") ================================================ FILE: scripts/start_jupyter.sh ================================================ #!/usr/bin/env bash jupyter lab --ip 0.0.0.0 --port 8888 --allow-root &> /var/log/jupyter.log echo "allow 10 sec for JupyterLab to start @ http://$(hostname -I | cut -d' ' -f1):8888 (password nvidia)" echo "JupyterLab logging location: /var/log/jupyter.log (inside the container)" ================================================ FILE: tests/run_tests.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import sys import json import logging import argparse import datetime import subprocess parser = argparse.ArgumentParser() parser.add_argument('--log-dir', default='', type=str, help='directory to save log files under') parser.add_argument('--tests', default='data/tests/tests.json', type=str, help='path to config file of tests') parser.add_argument('--model', default='', type=str, help='if specified, only run tests that use this model') parser.add_argument('--module', default='', type=str, help='if specified, only run tests that use this module') parser.add_argument('--config', default='', type=str, help='if specified, only run tests that use this test config') parser.add_argument('--generate', action='store_true', help='generate the expected outputs') args = parser.parse_args() if args.log_dir == '': args.log_dir = os.path.join('data/tests/logs', datetime.datetime.now().strftime("%Y%m%d_%H%M")) if not os.path.exists(args.log_dir): os.makedirs(args.log_dir) print(args) # wrapper for launching test processes def run_test(module, model, config, args=None, log_dir=None): config = os.path.join('data/tests', config) cmd = f"python3 tests/{module} --model {model} --config {config}" if args: cmd += ' ' + args print("\nrunning test:\n\t$", cmd, "\n") if log_dir: tee = f"tee {os.path.join(log_dir, os.path.splitext(os.path.basename(module))[0])}_{model}.txt" cmd = f"mkfifo pipe; {tee} < pipe & {cmd} > pipe; code=$?; rm pipe; exit $code" #
================================================
FILE: tests/run_tests.py
================================================
#!/usr/bin/env python3
# coding: utf-8

import os
import sys
import json
import logging
import argparse
import datetime
import subprocess

parser = argparse.ArgumentParser()

parser.add_argument('--log-dir', default='', type=str, help='directory to save log files under')
parser.add_argument('--tests', default='data/tests/tests.json', type=str, help='path to config file of tests')
parser.add_argument('--model', default='', type=str, help='if specified, only run tests that use this model')
parser.add_argument('--module', default='', type=str, help='if specified, only run tests that use this module')
parser.add_argument('--config', default='', type=str, help='if specified, only run tests that use this test config')
parser.add_argument('--generate', action='store_true', help='generate the expected outputs')

args = parser.parse_args()

if args.log_dir == '':
    args.log_dir = os.path.join('data/tests/logs', datetime.datetime.now().strftime("%Y%m%d_%H%M"))

if not os.path.exists(args.log_dir):
    os.makedirs(args.log_dir)

print(args)

# wrapper for launching test processes
def run_test(module, model, config, args=None, log_dir=None):
    config = os.path.join('data/tests', config)
    cmd = f"python3 tests/{module} --model {model} --config {config}"

    if args:
        cmd += ' ' + args

    print("\nrunning test:\n\t$", cmd, "\n")

    if log_dir:
        tee = f"tee {os.path.join(log_dir, os.path.splitext(os.path.basename(module))[0])}_{model}.txt"
        cmd = f"mkfifo pipe; {tee} < pipe & {cmd} > pipe; code=$?; rm pipe; exit $code"   # https://stackoverflow.com/a/1221844

    results = subprocess.run(cmd, shell=True)

    if results.returncode == 0:
        status = 'PASSED'
    elif results.returncode == 127:
        status = 'GENERATED'
    else:
        status = 'FAILED'

    print(f"\n{status} TEST {module} ({model}) - return code {results.returncode}\n")
    return status

# load the config containing all the tests
with open(args.tests) as config_file:
    test_config = json.load(config_file)

# filter the tests if requested
def filter_test(test):
    if args.model != '' and args.model != test['model']:
        return False
    if args.module != '' and args.module != test['module']:
        return False
    if args.config != '' and args.config != test['config']:
        return False
    return True

test_config = [test for test in test_config if filter_test(test)]

# run the tests
for test in test_config:
    test_args = test.get('args', '')

    if args.generate:
        test_args += ' --generate'

    status = run_test(test['module'], test['model'], test['config'], test_args, args.log_dir)

    # if the test needed to generate the expected outputs, run it again
    if status == 'GENERATED':
        print('generated expected outputs, running test again...')
        status = run_test(test['module'], test['model'], test['config'], test.get('args'), args.log_dir)

    test['status'] = status

# test summary
passed = 0

print('')
print('----------------------------------------------------')
print(' TEST SUMMARY')
print('----------------------------------------------------')

for test in test_config:
    test_str = f"{test['module']} ({test['model']})"
    print(f"{test_str:<40} {test['status']}")
    if test['status'] == 'PASSED':
        passed += 1

print(f"\npassed {passed} of {len(test_config)} tests")
print(f"saved logs to {args.log_dir}")
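run_tests.py expects the --tests file to be a JSON list of test descriptors, each naming the test module, the model, the per-test config file (resolved relative to data/tests/), and optional extra CLI args. A hypothetical entry, written out with the same json module the script uses to read it (the module/model/config values are illustrative):

import json

# hypothetical contents of data/tests/tests.json (values are illustrative)
tests = [
    {
        "module": "test_asr.py",
        "model": "quartznet",
        "config": "asr.json",         # resolved relative to data/tests/ by run_test()
        "args": "--threshold 1"       # optional extra CLI args for the test module
    }
]

with open('data/tests/tests.json', 'w') as f:
    json.dump(tests, f, indent=3)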
================================================
FILE: tests/test_asr.py
================================================
#!/usr/bin/env python3
# coding: utf-8

import os
import sys
import json
import nltk
import logging

from jetson_voice import ASR, AudioInput, ConfigArgParser

parser = ConfigArgParser()

parser.add_argument('--model', default='quartznet', type=str, help='path to model, service name, or json config file')
parser.add_argument('--config', type=str, required=True, help='path to test config file')
parser.add_argument('--threshold', type=int, default=0, help='threshold for comparing actual vs expected outputs')
parser.add_argument('--generate', action='store_true', help='generate the expected outputs')

args = parser.parse_args()
print(args)

print('')
print('----------------------------------------------------')
print(' RUNNING TEST (ASR)')
print('----------------------------------------------------')
print(f' model:  {args.model}')
print(f' config: {args.config}')
print('')

# load test config
with open(args.config) as config_file:
    test_config = json.load(config_file)

# load the model
asr = ASR(args.model)

# list of (passed, num_outputs) tuples
test_results = []

# run tests
for test in test_config:
    stream = AudioInput(wav=test['wav'], sample_rate=asr.sample_rate, chunk_size=asr.chunk_size)
    outputs = []

    for samples in stream:
        output = asr(samples)

        if asr.classification:
            print(f"class '{output[0]}' ({output[1]:.3f})")
            outputs.append(output[0])
        else:
            for transcript in output:
                print(transcript['text'])
                if transcript['end']:
                    print('')
                    outputs.append(transcript['text'])

    if not asr.classification:
        if not transcript['end']:   # pick up the last transcript
            outputs.append(transcript['text'])

    if 'outputs' not in test:
        test['outputs'] = {}

    if args.model not in test['outputs']:
        args.generate = True

    if args.generate:
        test['outputs'][args.model] = outputs
    else:
        expected_outputs = test['outputs'][args.model]

        if len(outputs) != len(expected_outputs):
            logging.error(f"failed test '{test['wav']}' - got {len(outputs)} outputs (expected {len(expected_outputs)})")
            test_results.append((0, len(expected_outputs)))
            continue

        passed = 0

        for i in range(len(expected_outputs)):
            distance = nltk.edit_distance(expected_outputs[i], outputs[i])

            if distance > args.threshold:
                logging.error(f"failed test '{test['wav']}' - edit distance {distance} exceeded threshold of {args.threshold}")
                logging.error(f"  expected: '{expected_outputs[i]}'")
                logging.error(f"  actual:   '{outputs[i]}'")
            else:
                passed += 1

        test_results.append((passed, len(expected_outputs)))

if args.generate:
    print('')
    logging.info(f"generated expected outputs, saving to '{args.config}'")

    with open(args.config, 'w') as config_file:
        json.dump(test_config, config_file, indent=3)

    sys.exit(127)   # exit code 127 signals run_tests.py that outputs were generated

# test summary
passed_tests = 0
passed_outputs = 0
total_outputs = 0

for passed, num_outputs in test_results:
    if passed == num_outputs:
        passed_tests += 1
    passed_outputs += passed
    total_outputs += num_outputs

print('')
print('----------------------------------------------------')
print(' TEST RESULTS (ASR)')
print('----------------------------------------------------')
print(f' model:  {args.model}')
print(f' config: {args.config}')
print(f' passed: {passed_tests} / {len(test_config)} audio files')
print(f'         {passed_outputs} / {total_outputs} outputs')
print('')

if passed_tests != len(test_config):
    logging.error(f"failed test '{args.config}' with model '{args.model}'")
    sys.exit(1)
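The comparison above is Levenshtein edit distance, so --threshold is the number of single-character edits tolerated between the expected and actual transcripts (the default of 0 demands an exact match). A quick illustration (the strings are made up):

import nltk

expected = 'hello world'
actual = 'hello word'      # one character dropped

distance = nltk.edit_distance(expected, actual)
print(distance)            # -> 1, so this passes with --threshold 1 but fails with the default of 0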
================================================
FILE: tests/test_nlp.py
================================================
#!/usr/bin/env python3
# coding: utf-8

import os
import sys
import json
import nltk
import pprint
import logging

from jetson_voice import NLP, ConfigArgParser

parser = ConfigArgParser()

parser.add_argument('--model', default='distilbert_qa_128', type=str, help='path to model, service name, or json config file')
parser.add_argument('--config', type=str, required=True, help='path to test config file')
parser.add_argument('--threshold', type=int, default=0, help='threshold for comparing actual vs expected outputs')
parser.add_argument('--generate', action='store_true', help='generate the expected outputs')

args = parser.parse_args()
print(args)

print('')
print('----------------------------------------------------')
print(' RUNNING TEST (NLP)')
print('----------------------------------------------------')
print(f' model:  {args.model}')
print(f' config: {args.config}')
print('')

# load test config
with open(args.config) as config_file:
    test_config = json.load(config_file)

# load the model
model = NLP(args.model)
type = model.config.type   # nlp task type ('intent_slot', 'qa', 'text_classification', 'token_classification')

"""
if args.type == 'intent_slot':
    model = IntentSlot(args.model)
elif args.type == 'qa':
    model = QuestionAnswer(args.model)
elif args.type == 'text_classification':
    model = TextClassification(args.model)
elif args.type == 'token_classification':
    model = TokenClassification(args.model)
"""

# list of (passed, num_outputs) tuples
test_results = []

# run tests
for test in test_config:
    outputs = []

    if type == 'intent_slot':
        for query in test['queries']:
            results = model(query)

            print('')
            print('query:', query, '\n')
            pprint.pprint(results)
            print('')

            result_str = results['intent']

            for slot in results['slots']:
                result_str += f" {slot['slot']}={slot['text']}"

            outputs.append(result_str)

    elif type == 'qa':
        for question in test['questions']:
            query = {
                'question': question,
                'context': test['context']
            }

            answer = model(query, top_k=1)

            print('\n')
            print('context:', query['context'])
            print('')
            print('question:', query['question'])
            print('')
            print('answer:', answer['answer'])
            print('score: ', answer['score'])

            outputs.append(answer['answer'])

    elif type == 'text_classification':
        for query in test['queries']:
            results = model(query)

            print('')
            print('query:', query, '\n')
            pprint.pprint(results)
            print('')

            outputs.append(results['label'])

    elif type == 'token_classification':
        for query in test['queries']:
            results = model(query)
            result_str = model.tag_string(query, results)

            print('')
            print('query:', query, '\n')
            print(model.tag_string(query, results, scores=True))
            print('')

            outputs.append(result_str)

    if 'outputs' not in test:
        test['outputs'] = {}

    if args.model not in test['outputs']:
        args.generate = True

    if args.generate:
        test['outputs'][args.model] = outputs
    else:
        expected_outputs = test['outputs'][args.model]

        if len(outputs) != len(expected_outputs):
            logging.error(f"failed test - got {len(outputs)} outputs (expected {len(expected_outputs)})")
            test_results.append((0, len(expected_outputs)))
            continue

        passed = 0

        for i in range(len(expected_outputs)):
            distance = nltk.edit_distance(expected_outputs[i], outputs[i])

            if distance > args.threshold:
                logging.error(f"failed test - edit distance {distance} exceeded threshold of {args.threshold}")
                logging.error(f"  expected: '{expected_outputs[i]}'")
                logging.error(f"  actual:   '{outputs[i]}'")
            else:
                passed += 1

        test_results.append((passed, len(expected_outputs)))

if args.generate:
    print('')
    logging.info(f"generated expected outputs, saving to '{args.config}'")

    with open(args.config, 'w') as config_file:
        json.dump(test_config, config_file, indent=3)

    sys.exit(127)   # exit code 127 signals run_tests.py that outputs were generated

# test summary
passed_tests = 0
passed_outputs = 0
total_outputs = 0

for passed, num_outputs in test_results:
    if passed == num_outputs:
        passed_tests += 1
    passed_outputs += passed
    total_outputs += num_outputs

print('')
print('----------------------------------------------------')
print(' TEST RESULTS (NLP)')
print('----------------------------------------------------')
print(f' model:  {args.model}')
print(f' config: {args.config}')
print(f' type:   {type}')
print(f' passed: {passed_tests} / {len(test_config)} tests')
print(f'         {passed_outputs} / {total_outputs} queries')
print('')

if passed_tests != len(test_config):
    logging.error(f"failed test '{args.config}' with model '{args.model}'")
    sys.exit(1)
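As the 'qa' branch shows, question-answering queries are dicts with 'question' and 'context' keys, and each QA test supplies one context shared by several questions. A hypothetical test entry for that branch (the context and questions are made up):

# hypothetical QA entry from a test config file (content is made up)
qa_test = {
    "context": "The Jetson Nano is a small computer made by NVIDIA. "
               "It was released in March 2019.",
    "questions": [
        "Who makes the Jetson Nano?",
        "When was the Jetson Nano released?"
    ]
}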
================================================
FILE: tests/test_tts.py
================================================
#!/usr/bin/env python3
# coding: utf-8

import os
import sys
import json
import librosa
import logging
import datetime

from jetson_voice import TTS, ConfigArgParser
from soundfile import SoundFile

parser = ConfigArgParser()

parser.add_argument('--model', default='fastpitch_hifigan', type=str, help='path to model, service name, or json config file')
parser.add_argument('--config', type=str, required=True, help='path to test config file')
parser.add_argument('--rms-threshold', type=float, default=0.005, help='threshold for comparing actual vs expected RMS')
parser.add_argument('--length-threshold', type=float, default=0.1, help='threshold for comparing actual vs expected audio length (in seconds)')
parser.add_argument('--generate', action='store_true', help='generate the expected outputs')
parser.add_argument('--output-dir', default='', help='output directory to save generated audio')

args = parser.parse_args()

if args.output_dir == '':
    args.output_dir = os.path.join('data/tests/tts', args.model, datetime.datetime.now().strftime("%Y%m%d_%H%M"))

if not os.path.exists(args.output_dir):
    os.makedirs(args.output_dir)

print(args)

print('')
print('----------------------------------------------------')
print(' RUNNING TEST (TTS)')
print('----------------------------------------------------')
print(f' model:  {args.model}')
print(f' config: {args.config}')
print('')

# load test config
with open(args.config) as config_file:
    test_config = json.load(config_file)

# load the model
tts = TTS(args.model)

# number of tests that passed
passed = 0

# run tests
for idx, test in enumerate(test_config):
    audio = tts(test['text'])

    wav_path = os.path.join(args.output_dir, f"{idx}.wav")
    wav = SoundFile(wav_path, mode='w', samplerate=tts.sample_rate, channels=1)
    wav.write(audio)
    wav.close()

    actual_length = len(audio) / tts.sample_rate
    actual_rms = float(librosa.feature.rms(y=audio, frame_length=len(audio), center=False)[0][0])

    print(f"'{test['text']}'")
    print(f"audio length = {actual_length}s, RMS = {actual_rms}")
    print(f"saved audio to '{wav_path}'\n")

    if 'outputs' not in test:
        test['outputs'] = {}

    if args.model not in test['outputs']:
        args.generate = True

    if args.generate:
        test['outputs'][args.model] = (actual_length, actual_rms)
    else:
        expected_length, expected_rms = test['outputs'][args.model]

        length_diff = abs(expected_length - actual_length)
        rms_diff = abs(expected_rms - actual_rms)

        if length_diff > args.length_threshold:
            logging.error(f"failed test - length difference of {length_diff}s exceeded threshold of {args.length_threshold} (actual={actual_length}s, expected={expected_length}s)")
            logging.error(f"  '{test['text']}'")
            continue

        if rms_diff > args.rms_threshold:
            logging.error(f"failed test - RMS difference of {rms_diff} exceeded threshold of {args.rms_threshold} (actual={actual_rms}, expected={expected_rms})")
            logging.error(f"  '{test['text']}'")
            continue

        passed += 1

if args.generate:
    print('')
    logging.info(f"generated expected outputs, saving to '{args.config}'")

    with open(args.config, 'w') as config_file:
        json.dump(test_config, config_file, indent=3)

    sys.exit(127)   # exit code 127 signals run_tests.py that outputs were generated

# test summary
print('')
print('----------------------------------------------------')
print(' TEST RESULTS (TTS)')
print('----------------------------------------------------')
print(f' model:  {args.model}')
print(f' config: {args.config}')
print(f' passed: {passed} / {len(test_config)}')
print('')

if passed != len(test_config):
    logging.error(f"failed test '{args.config}' with model '{args.model}'")
    sys.exit(1)
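Rather than comparing waveforms sample-for-sample, test_tts.py fingerprints each synthesized clip by its duration and a single-frame RMS, which tolerates small run-to-run variation in the synthesizer. With frame_length equal to the clip length, that single-frame RMS equals the root mean square of the whole clip, as this sketch with a synthetic signal shows (the sine wave stands in for TTS output):

import numpy as np
import librosa

# synthetic 1-second, 22.05 kHz sine wave standing in for TTS output
sample_rate = 22050
audio = 0.1 * np.sin(2 * np.pi * 440 * np.arange(sample_rate) / sample_rate).astype(np.float32)

# single-frame RMS over the whole clip, as computed in test_tts.py
rms = float(librosa.feature.rms(y=audio, frame_length=len(audio), center=False)[0][0])

# same value computed directly
print(rms, np.sqrt(np.mean(audio ** 2)))   # both ~0.0707 (= 0.1 / sqrt(2))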