Repository: dusty-nv/jetson-voice Branch: master Commit: c6a8c9552c70 Files: 115 Total size: 749.0 KB Directory structure: gitextract_8xzz9c2n/ ├── .dockerignore ├── .gitignore ├── .gitmodules ├── Dockerfile.aarch64 ├── Dockerfile.ros ├── Dockerfile.runtime ├── Dockerfile.x86_64 ├── README.md ├── docker/ │ ├── build.sh │ ├── push.sh │ ├── run.sh │ └── tag.sh ├── examples/ │ ├── asr.py │ ├── assistant.py │ ├── nlp.py │ ├── nlp_qa.py │ └── tts.py ├── jetson_voice/ │ ├── __init__.py │ ├── asr.py │ ├── auto.py │ ├── backends/ │ │ ├── onnxruntime/ │ │ │ ├── __init__.py │ │ │ └── ort_model.py │ │ ├── riva/ │ │ │ ├── __init__.py │ │ │ ├── riva_asr.py │ │ │ └── riva_tts.py │ │ └── tensorrt/ │ │ ├── __init__.py │ │ ├── trt_binding.py │ │ ├── trt_builder.py │ │ └── trt_model.py │ ├── models/ │ │ ├── __init__.py │ │ ├── asr/ │ │ │ ├── __init__.py │ │ │ ├── asr_engine.py │ │ │ ├── ctc_beamsearch.py │ │ │ ├── ctc_decoder.py │ │ │ ├── ctc_greedy.py │ │ │ └── ctc_utils.py │ │ ├── nlp/ │ │ │ ├── __init__.py │ │ │ ├── intent_slot.py │ │ │ ├── nlp_utils.py │ │ │ ├── question_answer.py │ │ │ ├── text_classification.py │ │ │ └── token_classification.py │ │ └── tts/ │ │ ├── __init__.py │ │ └── tts_engine.py │ ├── nlp.py │ ├── tts.py │ └── utils/ │ ├── __init__.py │ ├── audio.py │ ├── config.py │ ├── resource.py │ └── softmax.py ├── patches/ │ ├── nemo/ │ │ ├── 1.0.0rc1/ │ │ │ ├── exportable.original.py │ │ │ ├── exportable.py │ │ │ ├── nlp/ │ │ │ │ ├── __init__.py │ │ │ │ ├── distilbert.diff │ │ │ │ ├── distilbert.original.py │ │ │ │ ├── distilbert.py │ │ │ │ ├── huggingface_utils.py │ │ │ │ ├── location.txt │ │ │ │ └── mobilebert.py │ │ │ ├── setup.original.py │ │ │ └── setup.py │ │ └── 1.6.2/ │ │ ├── requirements.original.txt │ │ ├── requirements.txt │ │ ├── requirements_nlp.original.txt │ │ └── requirements_nlp.txt │ ├── pytorch/ │ │ ├── 1.6.0/ │ │ │ ├── functional.diff │ │ │ ├── functional.original.py │ │ │ └── functional.py │ │ └── 1.7.0/ │ │ ├── functional.diff │ │ ├── functional.original.py │ │ └── functional.py │ └── transformers/ │ ├── 4.5.0/ │ │ ├── convert_graph_to_onnx.diff │ │ ├── convert_graph_to_onnx.original.py │ │ ├── convert_graph_to_onnx.py │ │ └── modeling_distilbert.py │ └── 4.5.1/ │ ├── convert_graph_to_onnx.diff │ ├── convert_graph_to_onnx.original.py │ ├── convert_graph_to_onnx.py │ ├── modeling_distilbert.diff │ ├── modeling_distilbert.original.py │ └── modeling_distilbert.py ├── ros/ │ ├── CMakeLists.txt │ ├── jetson_voice_ros/ │ │ ├── __init__.py │ │ ├── asr.py │ │ ├── audio_input.py │ │ ├── audio_output.py │ │ ├── nlp_intent_slot.py │ │ ├── nlp_question_answer.py │ │ └── tts.py │ ├── launch/ │ │ ├── asr.launch.py │ │ ├── audio_playback.launch.py │ │ └── tts.launch.py │ ├── msg/ │ │ ├── Audio.msg │ │ ├── AudioInfo.msg │ │ ├── IntentSlot.msg │ │ ├── QuestionAnswerQuery.msg │ │ ├── QuestionAnswerResult.msg │ │ └── Slot.msg │ └── package.xml ├── scripts/ │ ├── list_audio_devices.py │ ├── list_models.py │ ├── nemo_export_onnx.py │ ├── nemo_list_models.py │ ├── nemo_train_classifier.py │ ├── nemo_train_intent.py │ ├── nemo_train_ner.py │ ├── nemo_train_qa.py │ ├── os_version.sh │ ├── record_mic.py │ └── start_jupyter.sh └── tests/ ├── run_tests.py ├── test_asr.py ├── test_nlp.py └── test_tts.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .dockerignore ================================================ data/ .git .cache ================================================ 
FILE: .gitignore ================================================ data/ logs/ packages/ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ ================================================ FILE: .gitmodules ================================================ [submodule "docker/containers"] path = docker/containers url = https://github.com/dusty-nv/jetson-containers ================================================ FILE: Dockerfile.aarch64 ================================================ # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), # to deal in the Software without restriction, including without limitation # the rights to use, copy, modify, merge, publish, distribute, sublicense, # and/or sell copies of the Software, and to permit persons to whom the # Software is furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. 
ARG BASE_IMAGE FROM ${BASE_IMAGE} ENV DEBIAN_FRONTEND=noninteractive ENV SHELL /bin/bash ENV LANG='en_US.UTF-8' LANGUAGE='en_US:en' LC_ALL='en_US.UTF-8' ARG MAKEFLAGS=-j$(nproc) ARG WORKSPACE=/jetson-voice WORKDIR ${WORKSPACE} # alias python3 -> python RUN rm /usr/bin/python && \ ln -s /usr/bin/python3 /usr/bin/python && \ ln -s /usr/bin/pip3 /usr/bin/pip ################################################################ ## tokenizers/transformers ################################################################ RUN apt-get update && \ apt-get install -y --no-install-recommends \ cmake \ curl \ pkg-config \ protobuf-compiler \ libprotoc-dev \ nano \ tzdata \ libssl-dev \ && rm -rf /var/lib/apt/lists/* \ && apt-get clean # install sentencepiece RUN git clone https://github.com/google/sentencepiece && \ cd sentencepiece && \ mkdir build && \ cd build && \ cmake .. && \ make -j $(nproc) && \ make install && \ ldconfig -v && \ cd .. && \ cd python && \ python3 setup.py install --verbose && \ cd ../../ && \ rm -r -f sentencepiece # install rust (used by tokenizers) RUN curl https://sh.rustup.rs -sSf | sh -s -- -y ENV PATH="/root/.cargo/bin:${PATH}" RUN rustc --version && \ pip3 install setuptools-rust # install tokenizers RUN pip3 install tokenizers --verbose # Apache arrow is needed by datasets package ('pip install pyarrow' is broken, so built from source) # https://github.com/apache/arrow/blob/master/docs/source/developers/python.rst#using-pip # https://raspberrypi.stackexchange.com/a/117723 RUN apt-get update && \ apt-get install -y --no-install-recommends \ libjemalloc-dev \ libboost-dev \ libboost-filesystem-dev \ libboost-system-dev \ libboost-regex-dev \ autoconf \ flex \ bison \ && rm -rf /var/lib/apt/lists/* \ && apt-get clean RUN git clone --branch apache-arrow-3.0.0 https://github.com/apache/arrow.git && \ cd arrow/cpp && \ mkdir build && \ cd build && \ export ARROW_HOME=/usr/local && \ cmake \ -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ -DCMAKE_INSTALL_LIBDIR=lib \ -DARROW_WITH_BZ2=ON \ -DARROW_WITH_ZLIB=ON \ -DARROW_WITH_ZSTD=ON \ -DARROW_WITH_LZ4=ON \ -DARROW_WITH_SNAPPY=ON \ -DARROW_PARQUET=ON \ -DARROW_CUDA=ON \ -DARROW_PYTHON=ON \ -DARROW_BUILD_TESTS=OFF \ .. 
&& \ make -j$(nproc) && \ make install && \ cd ../../python && \ python3 setup.py build_ext --build-type=release --with-parquet --with-cuda --verbose && \ python3 setup.py install --verbose && \ cd ../../ && \ rm -r -f arrow RUN pip3 show pyarrow && \ python3 -c "import pyarrow" && \ python3 -c "from pyarrow import cuda" # install huggingface (locked to 4.5.1, which the patches are based on) # datasets package is needed to run the huggingface examples RUN pip3 install transformers==4.5.1 datasets --verbose ################################################################ ## onnx / onnxruntime / onnx-graphsurgeon ################################################################ ARG ONNXRUNTIME_URL=https://nvidia.box.com/shared/static/ukszbm1iklzymrt54mgxbzjfzunq7i9t.whl ARG ONNXRUNTIME_WHL=onnxruntime_gpu-1.7.0-cp36-cp36m-linux_aarch64.whl RUN wget --quiet --show-progress --progress=bar:force:noscroll --no-check-certificate ${ONNXRUNTIME_URL} -O ${ONNXRUNTIME_WHL} && \ pip3 install ${ONNXRUNTIME_WHL} --verbose && \ pip3 install onnx psutil sympy --verbose && \ rm ${ONNXRUNTIME_WHL} # install onnx-graphsurgeon RUN cd /opt && \ git clone --recursive https://github.com/nvidia/tensorrt tensorrt && \ cd tensorrt/tools/onnx-graphsurgeon && \ python3 setup.py install --verbose && \ cd ../../../ && \ rm -r -f tensorrt ################################################################ ## NeMo ################################################################ RUN apt-get update && \ apt-get install -y --no-install-recommends \ libopencc-dev \ python3-tk \ libmecab-dev \ mecab \ && rm -rf /var/lib/apt/lists/* \ && apt-get clean RUN cd /opt && \ git clone --recursive --branch v0.11.1 https://github.com/pytorch/text torchtext && \ cd torchtext && \ python3 setup.py clean install RUN pip3 show torch torchvision torchaudio torchtext # clone/build nemo ARG NEMO_VERSION RUN cd /opt && git clone --recursive --branch v${NEMO_VERSION} https://github.com/nvidia/nemo # needed for nemo 1.0 #COPY patches/nemo/${NEMO_VERSION}/setup.py /opt/nemo/setup.py # needed for nemo 1.6 COPY patches/nemo/${NEMO_VERSION}/requirements.txt /opt/nemo/requirements/requirements.txt COPY patches/nemo/${NEMO_VERSION}/requirements_nlp.txt /opt/nemo/requirements/requirements_nlp.txt RUN pip3 install -r /opt/nemo/requirements/requirements.txt --verbose RUN pip3 install -r /opt/nemo/requirements/requirements_asr.txt --verbose RUN pip3 install -r /opt/nemo/requirements/requirements_nlp.txt --verbose RUN pip3 install -r /opt/nemo/requirements/requirements_tts.txt --verbose #RUN pip3 install omegaconf==2.1.0dev24 --verbose RUN cd /opt/nemo && python3 setup.py install --verbose ################################################################ ## ctc-decoders ################################################################ RUN apt-get update && \ apt-get install -y --no-install-recommends \ swig \ && rm -rf /var/lib/apt/lists/* \ && apt-get clean RUN git clone https://github.com/dusty-nv/OpenSeq2Seq -b ctc-decoders && \ cd OpenSeq2Seq/decoders && \ ./setup.sh RUN pip3 install git+https://github.com/NVIDIA/dllogger RUN pip3 install nltk ################################################################ ## Riva GRPC ################################################################ ARG RIVA_URL=https://nvidia.box.com/shared/static/cu8z4t1n6shkxl6z5nh9hpkpn9yxomcz.whl ARG RIVA_WHL=riva_api-1.0.0ea-py3-none-any.whl RUN wget --quiet --show-progress --progress=bar:force:noscroll --no-check-certificate ${RIVA_URL} -O ${RIVA_WHL} && \ pip3 install ${RIVA_WHL} 
--verbose && \ rm ${RIVA_WHL} ################################################################ ## install some audio stuff ################################################################ RUN apt-get update && \ apt-get install -y --no-install-recommends \ alsa-base \ libasound2-dev \ alsa-utils \ portaudio19-dev \ libsndfile1 \ unzip \ && rm -rf /var/lib/apt/lists/* \ && apt-get clean RUN pip3 install soundfile pyaudio wave ################################################################ ## various patches to install ################################################################ #COPY patches patches #RUN PYTHON_ROOT=`pip3 show torch | grep Location: | cut -d' ' -f2` && \ # PYTORCH_VERSION=`pip3 show torch | grep Version: | cut -d' ' -f2` && \ # TRANSFORMERS_VERSION=`pip3 show transformers | grep Version: | cut -d' ' -f2` && \ # NEMO_PATH="$PYTHON_ROOT/nemo_toolkit-${NEMO_VERSION}-py3.6.egg/nemo" && \ # echo "Python package root path: $PYTHON_ROOT" && \ # echo "Applying patches for PyTorch $PYTORCH_VERSION" && \ # echo "Applying patches for transformers $TRANSFORMERS_VERSION" && \ # cp patches/pytorch/$PYTORCH_VERSION/functional.py $PYTHON_ROOT/torch/functional.py && \ # cp patches/transformers/$TRANSFORMERS_VERSION/convert_graph_to_onnx.py $PYTHON_ROOT/transformers/convert_graph_to_onnx.py && \ # cp patches/transformers/$TRANSFORMERS_VERSION/modeling_distilbert.py $PYTHON_ROOT/transformers/models/distilbert/modeling_distilbert.py && \ # cp patches/nemo/${NEMO_VERSION}/nlp/distilbert.py $NEMO_PATH/collections/nlp/modules/common/huggingface/distilbert.py && \ # cp patches/nemo/${NEMO_VERSION}/exportable.py $NEMO_PATH/core/classes/exportable.py # set Python to unicode ENV PYTHONIOENCODING=utf-8 # disable JupyterLab from auto-starting (inherited behavior from l4t-ml) CMD /bin/bash ================================================ FILE: Dockerfile.ros ================================================ # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), # to deal in the Software without restriction, including without limitation # the rights to use, copy, modify, merge, publish, distribute, sublicense, # and/or sell copies of the Software, and to permit persons to whom the # Software is furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. 
ARG BASE_IMAGE=jetson-voice:r32.5.0-foxy-base FROM ${BASE_IMAGE} ################################################################ ## install jetson_voice_ros package ################################################################ COPY ros /tmp/jetson_voice_ros RUN source ${ROS_ROOT}/install/setup.bash && \ mkdir -p ${ROS_ROOT}/src && \ cd ${ROS_ROOT} && \ cp -r /tmp/jetson_voice_ros src && \ # build the package colcon build \ --merge-install \ --base-paths src/jetson_voice_ros \ --event-handlers console_direct+ && \ # clean-up build files rm -rf ${ROS_ROOT}/src && \ rm -rf ${ROS_ROOT}/logs && \ rm -rf ${ROS_ROOT}/build ################################################################ ## project install ################################################################ ARG WORKSPACE=/jetson-voice COPY jetson_voice ${WORKSPACE}/jetson_voice COPY examples ${WORKSPACE}/examples COPY scripts ${WORKSPACE}/scripts COPY tests ${WORKSPACE}/tests ENV PYTHONPATH="${WORKSPACE}:${PYTHONPATH}" ================================================ FILE: Dockerfile.runtime ================================================ # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), # to deal in the Software without restriction, including without limitation # the rights to use, copy, modify, merge, publish, distribute, sublicense, # and/or sell copies of the Software, and to permit persons to whom the # Software is furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. ARG BASE_IMAGE=jetson-voice:r32.5.0-base FROM ${BASE_IMAGE} ARG WORKSPACE=/jetson-voice WORKDIR ${WORKSPACE} ################################################################ ## project install ################################################################ COPY jetson_voice ${WORKSPACE}/jetson_voice COPY examples ${WORKSPACE}/examples COPY scripts ${WORKSPACE}/scripts COPY tests ${WORKSPACE}/tests ENV PYTHONPATH="${WORKSPACE}:${PYTHONPATH}" ================================================ FILE: Dockerfile.x86_64 ================================================ # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), # to deal in the Software without restriction, including without limitation # the rights to use, copy, modify, merge, publish, distribute, sublicense, # and/or sell copies of the Software, and to permit persons to whom the # Software is furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. 
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. ARG BASE_IMAGE FROM ${BASE_IMAGE} ENV DEBIAN_FRONTEND=noninteractive ENV SHELL /bin/bash ARG MAKEFLAGS=-j$(nproc) ARG WORKSPACE=/jetson-voice WORKDIR ${WORKSPACE} ################################################################ ## PyCUDA ################################################################ RUN pip3 install pycuda six --verbose ################################################################ ## ctc-decoders ################################################################ RUN apt-get update && \ apt-get install -y --no-install-recommends \ swig \ && rm -rf /var/lib/apt/lists/* \ && apt-get clean RUN git clone https://github.com/dusty-nv/OpenSeq2Seq -b ctc-decoders && \ cd OpenSeq2Seq/decoders && \ ./setup.sh RUN pip3 install git+https://github.com/NVIDIA/dllogger RUN pip3 install nltk ################################################################ ## Jarvis GRPC ################################################################ ARG JARVIS_URL=https://nvidia.box.com/shared/static/on9t7zqes2s6er1wpumidnc6rphwsyy7.whl ARG JARVIS_WHL=jarvis_api-1.0.0b1-py3-none-any.whl RUN wget --quiet --show-progress --progress=bar:force:noscroll --no-check-certificate ${JARVIS_URL} -O ${JARVIS_WHL} && \ pip3 install ${JARVIS_WHL} --verbose && \ rm ${JARVIS_WHL} ################################################################ ## install some audio stuff ################################################################ RUN apt-get update && \ apt-get install -y --no-install-recommends \ alsa-base \ libasound2-dev \ alsa-utils \ portaudio19-dev \ libsndfile1 \ unzip \ tzdata \ nano \ && rm -rf /var/lib/apt/lists/* \ && apt-get clean RUN pip3 install soundfile pyaudio wave ################################################################ ## various patches to install ################################################################ COPY patches patches ARG NEMO_VERSION RUN PYTHON_ROOT=`pip3 show transformers | grep Location: | cut -d' ' -f2` && \ TRANSFORMERS_VERSION=`pip3 show transformers | grep Version: | cut -d' ' -f2` && \ echo "Python package root path: $PYTHON_ROOT" && \ echo "Applying patches for transformers $TRANSFORMERS_VERSION" && \ cp patches/transformers/$TRANSFORMERS_VERSION/convert_graph_to_onnx.py $PYTHON_ROOT/transformers/convert_graph_to_onnx.py && \ cp patches/transformers/$TRANSFORMERS_VERSION/modeling_distilbert.py $PYTHON_ROOT/transformers/models/distilbert/modeling_distilbert.py && \ cp patches/nemo/${NEMO_VERSION}/nlp/distilbert.py $PYTHON_ROOT/nemo/collections/nlp/modules/common/huggingface/distilbert.py && \ cp patches/nemo/${NEMO_VERSION}/exportable.py $PYTHON_ROOT/nemo/core/classes/exportable.py # set Python to unicode ENV PYTHONIOENCODING=utf-8 ================================================ FILE: README.md ================================================ # jetson-voice jetson-voice is an ASR/NLP/TTS deep learning inference library for Jetson Nano, TX1/TX2, Xavier NX, and AGX Xavier. It supports Python and JetPack 4.4.1 or newer. 
The DNN models were trained with [NeMo](https://github.com/NVIDIA/NeMo) and deployed with [TensorRT](https://developer.nvidia.com/tensorrt) for optimized performance. All computation is performed using the onboard GPU. Currently the following capabilities are included: * [Automatic Speech Recognition (ASR)](#automatic-speech-recognition-asr) * [Streaming ASR (QuartzNet)](#automatic-speech-recognition-asr) * [Command/Keyword Recognition (MatchboxNet)](#commandkeyword-recognition) * [Voice Activity Detection (VAD MarbleNet)](#voice-activity-detection-vad) * [Natural Language Processing (NLP)](#natural-language-processing-nlp) * [Joint Intent/Slot Classification](#joint-intentslot-classification) * [Text Classification (Sentiment Analysis)](#text-classification) * [Token Classification (Named Entity Recognition)](#token-classification) * [Question/Answering (QA)](#questionanswering) * [Text-to-Speech (TTS)](#text-to-speech-tts) The NLP models use the [DistilBERT](https://arxiv.org/abs/1910.01108) transformer architecture for reduced memory usage and increased performance. For samples of the text-to-speech output, see the [TTS Audio Samples](#tts-audio-samples) section below. ## Running the Container jetson-voice is distributed as a Docker container due to the number of dependencies. There are pre-built container images available on DockerHub for JetPack 4.4.1 and newer: ``` dustynv/jetson-voice:r32.4.4 # JetPack 4.4.1 (L4T R32.4.4) dustynv/jetson-voice:r32.5.0 # JetPack 4.5 (L4T R32.5.0) / JetPack 4.5.1 (L4T R32.5.1) dustynv/jetson-voice:r32.6.1 # JetPack 4.6 (L4T R32.6.1) dustynv/jetson-voice:r32.7.1 # JetPack 4.6.1 (L4T R32.7.1) ``` To download and run the container, you can simply clone this repo and use the `docker/run.sh` script: ``` bash $ git clone --branch dev https://github.com/dusty-nv/jetson-voice $ cd jetson-voice $ docker/run.sh ``` > **note**: if you want to use a USB microphone or speaker, plug it in *before* you start the container There are some optional arguments to `docker/run.sh` that you can use: * `-r` (`--run`) specifies a run command, otherwise the container will start in an interactive shell. * `-v` (`--volume`) mounts a directory from the host into the container (`/host/path:/container/path`) * `--dev` starts the container in development mode, where all the source files are mounted for easy editing The run script will automatically mount the `data/` directory into the container, which stores the models and other data files. If you save files from the container there, they will also show up under `data/` on the host. ## Automatic Speech Recognition (ASR) The speech recognition in jetson-voice is a streaming service, so it's intended to be used on live sources, and it transcribes the audio in 1-second chunks. It uses a [QuartzNet-15x5](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#quartznet) model followed by a CTC beam search decoder and language model to further refine the raw output of the network. It detects breaks in the audio to determine the end of sentences. For information about using the ASR APIs, please refer to [`jetson_voice/asr.py`](jetson_voice/asr.py) and see [`examples/asr.py`](examples/asr.py). After you start the container, first run a test audio file (wav/ogg/flac) through [`examples/asr.py`](examples/asr.py) to verify that the system is functional.
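In code, the streaming API from [`examples/asr.py`](examples/asr.py) boils down to a few lines. A condensed sketch (the model name and wav path are just example values, and `AudioInput` is assumed to default to no microphone when only `wav` is given):

``` python
from jetson_voice import ASR, AudioInput

asr = ASR('quartznet')   # the first load builds a TensorRT engine, which is then cached to disk

# stream audio from a file (or pass mic=<device id/name> for a live microphone)
stream = AudioInput(wav='data/audio/dusty.wav',
                    sample_rate=asr.sample_rate,
                    chunk_size=asr.chunk_size)

for samples in stream:
    for transcript in asr(samples):    # list of running phrases/sentences
        print(transcript['text'])
        if transcript['end']:          # a break in the audio ends the sentence
            print('')
```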
Run this command (and all subsequent commands) inside the container: ``` bash $ examples/asr.py --wav data/audio/dusty.wav hi hi hi this is dust hi hi this is dusty check hi hi this is dusty check one two hi hi this is dusty check one two three hi hi this is dusty check one two three. what's the weather or what's the weather going to be tomorrow what's the weather going to be tomorrow in pittsburgh what's the weather going to be tomorrow in pittsburgh. today is today is wednesday today is wednesday tomorrow is thursday today is wednesday tomorrow is thursday. i would like i would like to order a large i would like to order a large pepperoni pizza i would like to order a large pepperoni pizza. is it going to be is it going to be cloudy tomorrow. ``` > The first time you run each model, TensorRT will take a few minutes to optimize it. > This optimized model is then cached to disk, so the next time you run the model it will load faster. #### Live Microphone To test the ASR on a mic, first list the audio devices in your system to get the audio device IDs: ``` bash $ scripts/list_audio_devices.py ---------------------------------------------------- Audio Input Devices ---------------------------------------------------- Input Device ID 1 - 'tegra-snd-t210ref-mobile-rt565x: - (hw:1,0)' (inputs=16) (sample_rate=44100) Input Device ID 2 - 'tegra-snd-t210ref-mobile-rt565x: - (hw:1,1)' (inputs=16) (sample_rate=44100) Input Device ID 3 - 'tegra-snd-t210ref-mobile-rt565x: - (hw:1,2)' (inputs=16) (sample_rate=44100) Input Device ID 4 - 'tegra-snd-t210ref-mobile-rt565x: - (hw:1,3)' (inputs=16) (sample_rate=44100) Input Device ID 5 - 'tegra-snd-t210ref-mobile-rt565x: - (hw:1,4)' (inputs=16) (sample_rate=44100) Input Device ID 6 - 'tegra-snd-t210ref-mobile-rt565x: - (hw:1,5)' (inputs=16) (sample_rate=44100) Input Device ID 7 - 'tegra-snd-t210ref-mobile-rt565x: - (hw:1,6)' (inputs=16) (sample_rate=44100) Input Device ID 8 - 'tegra-snd-t210ref-mobile-rt565x: - (hw:1,7)' (inputs=16) (sample_rate=44100) Input Device ID 9 - 'tegra-snd-t210ref-mobile-rt565x: - (hw:1,8)' (inputs=16) (sample_rate=44100) Input Device ID 10 - 'tegra-snd-t210ref-mobile-rt565x: - (hw:1,9)' (inputs=16) (sample_rate=44100) Input Device ID 11 - 'Logitech H570e Mono: USB Audio (hw:2,0)' (inputs=2) (sample_rate=44100) Input Device ID 12 - 'Samson Meteor Mic: USB Audio (hw:3,0)' (inputs=2) (sample_rate=44100) ``` > If you don't see your audio device listed, exit and restart the container. > USB devices should be attached *before* the container is started. Then run the ASR example with the `--mic` option, and specify either the device ID or name: ``` bash $ examples/asr.py --mic 11 hey hey how are you guys hey how are you guys. # (Press Ctrl+C to exit) ``` ## ASR Classification There are other ASR models included for command/keyword recognition ([MatchboxNet](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/speech_classification/models.html#matchboxnet-speech-commands)) and voice activity detection ([VAD MarbleNet](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/speech_classification/models.html#marblenet-vad)). These models are smaller and faster, and classify chunks of audio as opposed to transcribing text.
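The classification models are loaded and run through the same `ASR()` API. As in [`examples/asr.py`](examples/asr.py), when `asr.classification` is true the result for each audio chunk is a `(class, probability)` pair rather than a transcript. A minimal sketch (the mic ID is an example value):

``` python
from jetson_voice import ASR, AudioInput

asr = ASR('matchboxnet')    # or 'vad_marblenet'

stream = AudioInput(mic='11', sample_rate=asr.sample_rate, chunk_size=asr.chunk_size)

for samples in stream:
    results = asr(samples)
    if asr.classification:  # true for keyword/VAD models
        print(f"class '{results[0]}' ({results[1]:.3f})")
```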
### Command/Keyword Recognition The [MatchboxNet](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/speech_classification/models.html#matchboxnet-speech-commands) model was trained on 12 keywords from the [Google Speech Commands](https://ai.googleblog.com/2017/08/launching-speech-commands-dataset.html) dataset: ``` # MatchboxNet classes "yes", "no", "up", "down", "left", "right", "on", "off", "stop", "go", "unknown", "silence" ``` You can run it through the same ASR example as above by specifying the `--model matchboxnet` argument: ``` bash $ examples/asr.py --model matchboxnet --wav data/audio/commands.wav class 'unknown' (0.384) class 'yes' (1.000) class 'no' (1.000) class 'up' (1.000) class 'down' (1.000) class 'left' (1.000) class 'left' (1.000) class 'right' (1.000) class 'on' (1.000) class 'off' (1.000) class 'stop' (1.000) class 'go' (1.000) class 'go' (1.000) class 'silence' (0.639) class 'silence' (0.576) ``` The numbers printed on the right are the classification probabilities between 0 and 1. ### Voice Activity Detection (VAD) The voice activity model ([VAD MarbleNet](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/speech_classification/models.html#marblenet-vad)) is a binary model that outputs `background` or `speech`: ``` bash $ examples/asr.py --model vad_marblenet --wav data/audio/commands.wav class 'background' (0.969) class 'background' (0.984) class 'background' (0.987) class 'speech' (0.997) class 'speech' (1.000) class 'speech' (1.000) class 'speech' (0.998) class 'background' (0.987) class 'speech' (1.000) class 'speech' (1.000) class 'speech' (1.000) class 'background' (0.988) class 'background' (0.784) ``` ## Natural Language Processing (NLP) There are two samples included for NLP: * [`examples/nlp.py`](examples/nlp.py) (intent/slot, text classification, token classification) * [`examples/nlp_qa.py`](examples/nlp_qa.py) (question/answering) These each use a [DistilBERT](https://arxiv.org/abs/1910.01108) model that has been fine-tuned for its particular task. For information about using the NLP APIs, please refer to [`jetson_voice/nlp.py`](jetson_voice/nlp.py) and see the samples above. ### Joint Intent/Slot Classification Joint intent/slot classification is the task of classifying the intent of a query and detecting all of the relevant slots (entities) for that intent. For example, in the query `What is the weather in Santa Clara tomorrow morning?`, we would like to classify the query as a `weather` intent, and detect `Santa Clara` as a location slot and `tomorrow morning` as a date_time slot. Intent and slot names are usually task-specific and defined as labels in the training data. The included intent/slot model was trained on the [NLU-Evaluation-Data](https://github.com/xliuhw/NLU-Evaluation-Data) dataset - you can find the various intent and slot classes that it supports [here](https://gist.github.com/dusty-nv/119474dfcf3bfccfbb8428951a64cd23). They are common things that you might ask a virtual assistant: ``` $ examples/nlp.py --model distilbert_intent Enter intent_slot query, or Q to quit: > What is the weather in Santa Clara tomorrow morning?
{'intent': 'weather_query', 'score': 0.7165476, 'slots': [{'score': 0.6280392, 'slot': 'place_name', 'text': 'Santa'}, {'score': 0.61760694, 'slot': 'place_name', 'text': 'Clara'}, {'score': 0.5439486, 'slot': 'date', 'text': 'tomorrow'}, {'score': 0.4520608, 'slot': 'date', 'text': 'morning'}]} > Set an alarm for 730am {'intent': 'alarm_set', 'score': 0.5713072, 'slots': [{'score': 0.40017933, 'slot': 'time', 'text': '730am'}]} > Turn up the volume {'intent': 'audio_volume_up', 'score': 0.33523008, 'slots': []} > What is my schedule for tomorrow? {'intent': 'calendar_query', 'score': 0.37434494, 'slots': [{'score': 0.5732627, 'slot': 'date', 'text': 'tomorrow'}]} > Order a pepperoni pizza from domino's {'intent': 'takeaway_order', 'score': 0.50629586, 'slots': [{'score': 0.27558547, 'slot': 'food_type', 'text': 'pepperoni'}, {'score': 0.2778827, 'slot': 'food_type', 'text': 'pizza'}, {'score': 0.21785143, 'slot': 'business_name', 'text': 'dominos'}]} > Where's the closest Starbucks? {'intent': 'recommendation_locations', 'score': 0.5438984, 'slots': [{'score': 0.1604197, 'slot': 'place_name', 'text': 'Starbucks'}]} ``` ### Text Classification In this text classification example, we'll use the included sentiment analysis model that was trained on the [Stanford Sentiment Treebank (SST-2)](https://nlp.stanford.edu/sentiment/index.html) dataset. It will label queries as either positive or negative, along with their probability: ``` $ examples/nlp.py --model distilbert_sentiment Enter text_classification query, or Q to quit: > today was warm, sunny and beautiful out {'class': 1, 'label': '1', 'score': 0.9985898} > today was cold and rainy and not very nice {'class': 0, 'label': '0', 'score': 0.99136007} ``` (class 0 is negative sentiment and class 1 is positive sentiment) ### Token Classification Whereas text classification classifies entire queries, token classification classifies individual tokens (or words). In this example, we'll be performing Named Entity Recognition (NER), which is the task of detecting and classifying key information (entities) in text. For example, in the sentence `Mary lives in Santa Clara and works at NVIDIA`, we should detect that `Mary` is a person, `Santa Clara` is a location and `NVIDIA` is a company. The included token classification model for NER was trained on the [Groningen Meaning Bank (GMB)](http://www.let.rug.nl/bjerva/gmb/about.php) and supports the following annotations in [IOB format](https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)) (short for inside, outside, beginning): * LOC = Geographical Entity * ORG = Organization * PER = Person * GPE = Geopolitical Entity * TIME = Time indicator * MISC = Artifact, Event, or Natural Phenomenon ``` bash $ examples/nlp.py --model distilbert_ner Enter token_classification query, or Q to quit: > Mary lives in Santa Clara and works at NVIDIA Mary[B-PER 0.989] lives in Santa[B-LOC 0.998] Clara[I-LOC 0.996] and works at NVIDIA[B-ORG 0.967] > Lisa's favorite place to climb in the summer is El Capitan in Yosemite National Park in California, U.S. Lisa's[B-PER 0.995] favorite place to climb in the summer[B-TIME 0.996] is El[B-PER 0.577] Capitan[I-PER 0.483] in Yosemite[B-LOC 0.987] National[I-LOC 0.988] Park[I-LOC 0.98] in California[B-LOC 0.998], U.S[B-LOC 0.997]. ``` ### Question/Answering Question/Answering (QA) works by supplying a context paragraph, from which the model then extracts the best answer.
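In code (condensed from [`examples/nlp_qa.py`](examples/nlp_qa.py)), a query is a dict containing the context paragraph and the question; the context below is an abbreviated example:

``` python
from jetson_voice import QuestionAnswer

model = QuestionAnswer('distilbert_qa_384')

query = {
    'context'  : "The Amazon rainforest is a moist broadleaf forest that covers "
                 "most of the Amazon basin of South America.",
    'question' : "Where is the Amazon rainforest?"
}

result = model(query, top_k=1)   # with top_k=1 a single result dict is returned
print('Answer:', result['answer'])
print('Score: ', result['score'])
```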
The [`nlp_qa.py`](examples/nlp_qa.py) example allows you to select from several built-in context paragraphs (or supply your own) and to ask questions about these topics. The QA model is flexible and doesn't need to be retrained for different topics, as it was trained on the [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) question/answering dataset, which allows it to extract answers from a variety of contexts. It essentially learns to identify the information most relevant to your query from the context passage, as opposed to learning the content itself. ``` bash $ examples/nlp_qa.py Context: The Amazon rainforest is a moist broadleaf forest that covers most of the Amazon basin of South America. This basin encompasses 7,000,000 square kilometres (2,700,000 sq mi), of which 5,500,000 square kilometres (2,100,000 sq mi) are covered by the rainforest. The majority of the forest is contained within Brazil, with 60% of the rainforest, followed by Peru with 13%, and Colombia with 10%. Enter a question, C to change context, P to print context, or Q to quit: > How big is the Amazon? Answer: 7,000,000 square kilometres Score: 0.24993503093719482 > which country has the most? Answer: Brazil Score: 0.5964332222938538 ``` To change the topic or create one of your own, enter `C`: ``` Enter a question, C to change context, P to print context, or Q to quit: > C Select from one of the following topics, or enter your own context paragraph: 1. Amazon 2. Geology 3. Moon Landing 4. Pi 5. Super Bowl 55 > 3 Context: The first manned Moon landing was Apollo 11 on July, 20 1969. The first human to step on the Moon was astronaut Neil Armstrong followed second by Buzz Aldrin. They landed in the Sea of Tranquility with their lunar module the Eagle. They were on the lunar surface for 2.25 hours and collected 50 pounds of moon rocks. Enter a question, C to change context, P to print context, or Q to quit: > Who was the first man on the moon? Answer: Neil Armstrong Score: 0.39105066657066345 ``` ## Text-to-Speech (TTS) The text-to-speech service uses an ensemble of two models: FastPitch to generate mel spectrograms from text, and HiFiGAN as the vocoder (female English voice). For information about using the TTS APIs, please refer to [`jetson_voice/tts.py`](jetson_voice/tts.py) and see [`examples/tts.py`](examples/tts.py). The [`examples/tts.py`](examples/tts.py) app can output the audio to a speaker, wav file, or sequence of wav files. Run it with `--list-devices` to get a list of your audio devices. ``` bash $ examples/tts.py --output-device 11 --output-wav data/audio/tts_test > The weather tomorrow is forecast to be warm and sunny with a high of 83 degrees. Run 0 -- Time to first audio: 1.820s. Generated 5.36s of audio. RTFx=2.95. Run 1 -- Time to first audio: 0.232s. Generated 5.36s of audio. RTFx=23.15. Run 2 -- Time to first audio: 0.230s. Generated 5.36s of audio. RTFx=23.31. Run 3 -- Time to first audio: 0.231s. Generated 5.36s of audio. RTFx=23.25. Run 4 -- Time to first audio: 0.230s. Generated 5.36s of audio. RTFx=23.36. Run 5 -- Time to first audio: 0.230s. Generated 5.36s of audio. RTFx=23.35. Wrote audio to data/audio/tts_test/0.wav Enter text, or Q to quit: > Sally sells seashells by the seashore. Run 0 -- Time to first audio: 0.316s. Generated 2.73s of audio. RTFx=8.63. Run 1 -- Time to first audio: 0.126s. Generated 2.73s of audio. RTFx=21.61. Run 2 -- Time to first audio: 0.127s. Generated 2.73s of audio. RTFx=21.51. Run 3 -- Time to first audio: 0.126s. Generated 2.73s of audio. RTFx=21.68.
Run 4 -- Time to first audio: 0.126s. Generated 2.73s of audio. RTFx=21.68. Run 5 -- Time to first audio: 0.126s. Generated 2.73s of audio. RTFx=21.61. Wrote audio to data/audio/tts_test/1.wav ``` #### TTS Audio Samples * [Weather forecast](data/audio/tts_examples/0.wav) (wav) * [Sally sells seashells](data/audio/tts_examples/1.wav) (wav) ## Tests There is an automated test suite included that will verify all of the models are working properly. You can run it with the `tests/run_tests.py` script: ``` bash $ tests/run_tests.py ---------------------------------------------------- TEST SUMMARY ---------------------------------------------------- test_asr.py (quartznet) PASSED test_asr.py (quartznet_greedy) PASSED test_asr.py (matchboxnet) PASSED test_asr.py (vad_marblenet) PASSED test_nlp.py (distilbert_qa_128) PASSED test_nlp.py (distilbert_qa_384) PASSED test_nlp.py (distilbert_intent) PASSED test_nlp.py (distilbert_sentiment) PASSED test_nlp.py (distilbert_ner) PASSED test_tts.py (fastpitch_hifigan) PASSED passed 10 of 10 tests saved logs to data/tests/logs/20210610_1512 ``` The logs of the individual tests are printed to the screen and saved to a timestamped directory. ================================================ FILE: docker/build.sh ================================================ #!/usr/bin/env bash ROS_DISTRO=${1:-"none"} BASE_IMAGE=$2 NEMO_VERSION="1.0.0rc1" # find container tag from os version source docker/tag.sh if [ $ARCH = "aarch64" ]; then if [ -z $BASE_IMAGE ]; then if [ $L4T_VERSION = "32.7.1" ]; then BASE_IMAGE="l4t-ml:r32.7.1-py3" #BASE_IMAGE="nvcr.io/nvidia/l4t-ml:r32.7.1-py3" NEMO_VERSION="1.6.2" elif [ $L4T_VERSION = "32.6.1" ]; then BASE_IMAGE="nvcr.io/nvidia/l4t-ml:r32.6.1-py3" elif [ $L4T_VERSION = "32.5.0" ] || [ $L4T_VERSION = "32.5.1" ]; then BASE_IMAGE="nvcr.io/nvidia/l4t-ml:r32.5.0-py3" elif [ $L4T_VERSION = "32.4.4" ]; then BASE_IMAGE="nvcr.io/nvidia/l4t-ml:r32.4.4-py3" elif [ $L4T_VERSION = "32.4.3" ]; then BASE_IMAGE="nvcr.io/nvidia/l4t-ml:r32.4.3-py3" elif [ $L4T_VERSION = "32.4.2" ]; then BASE_IMAGE="nvcr.io/nvidia/l4t-ml:r32.4.2-py3" else echo "cannot build jetson-voice docker container for L4T R$L4T_VERSION" echo "please upgrade to the latest JetPack, or build jetson-voice natively" exit 1 fi fi elif [ $ARCH = "x86_64" ]; then BASE_IMAGE=${BASE_IMAGE:-"nvcr.io/nvidia/nemo:$NEMO_VERSION"} fi VOICE_CONTAINER="$CONTAINER_NAME:$TAG" VOICE_CONTAINER_BASE="$VOICE_CONTAINER-base" # build the base container echo "CONTAINER=$VOICE_CONTAINER_BASE" echo "BASE_IMAGE=$BASE_IMAGE" sudo docker build -t $VOICE_CONTAINER_BASE -f Dockerfile.$ARCH \ --build-arg BASE_IMAGE=$BASE_IMAGE \ --build-arg NEMO_VERSION=$NEMO_VERSION \ . # build the runtime container echo "CONTAINER=$VOICE_CONTAINER" echo "BASE_IMAGE=$VOICE_CONTAINER_BASE" sudo docker build -t $VOICE_CONTAINER -f Dockerfile.runtime \ --build-arg BASE_IMAGE=$VOICE_CONTAINER_BASE \ . # build ROS version of container if [[ "$ROS_DISTRO" != "none" ]] && [[ $ARCH = "aarch64" ]]; then ROS_CONTAINER="$VOICE_CONTAINER-ros-$ROS_DISTRO" ROS_CONTAINER_BASE="$ROS_CONTAINER-base" # copy files needed to build ROS container if [ ! 
-d "packages/" ]; then cp -r docker/containers/packages packages fi # opencv.csv mounts files that preclude us installing different version of opencv # temporarily disable the opencv.csv mounts while we build the container CV_CSV="/etc/nvidia-container-runtime/host-files-for-container.d/opencv.csv" if [ -f "$CV_CSV" ]; then sudo mv $CV_CSV $CV_CSV.backup fi # build ROS on top of jetson-voice echo "CONTAINER=$ROS_CONTAINER_BASE" echo "BASE_IMAGE=$VOICE_CONTAINER_BASE" sudo docker build -t $ROS_CONTAINER_BASE -f docker/containers/Dockerfile.ros.$ROS_DISTRO \ --build-arg BASE_IMAGE=$VOICE_CONTAINER_BASE \ . # install jetson_voice_ros package echo "CONTAINER=$ROS_CONTAINER" echo "BASE_IMAGE=$ROS_CONTAINER_BASE" sudo docker build -t $ROS_CONTAINER -f Dockerfile.ros \ --build-arg BASE_IMAGE=$ROS_CONTAINER_BASE \ . # restore opencv.csv mounts if [ -f "$CV_CSV.backup" ]; then sudo mv $CV_CSV.backup $CV_CSV fi fi ================================================ FILE: docker/push.sh ================================================ #!/usr/bin/env bash ROS_DISTRO=${1:-"foxy"} source docker/tag.sh # push image push() { local remote_image="dustynv/$1" sudo docker rmi $remote_image sudo docker tag $1 $remote_image echo "pushing image $remote_image" sudo docker push $remote_image echo "done pushing image $remote_image" } push "$CONTAINER_NAME:$TAG" ROS_CONTAINER="$CONTAINER_NAME:$TAG-ros-$ROS_DISTRO" push "$ROS_CONTAINER" ================================================ FILE: docker/run.sh ================================================ #!/usr/bin/env bash # # Start an instance of the jetson-voice docker container. # See below or run this script with -h or --help to see usage options. # # This script should be run from the root dir of the jetson-voice project: # # $ cd /path/to/your/jetson-voice # $ docker/run.sh # show_help() { echo " " echo "usage: Starts the Docker container and runs a user-specified command" echo " " echo " ./docker/run.sh --container DOCKER_IMAGE" echo " --volume HOST_DIR:MOUNT_DIR" echo " --run RUN_COMMAND" echo " " echo "args:" echo " " echo " --help Show this help text and quit" echo " " echo " -c, --container DOCKER_IMAGE Specifies the name of the Docker container" echo " image to use (default: 'jetson-voice')" echo " " echo " --ros ROS_DISTRO Starts the version of the container using the" echo " specified ROS distro (or foxy if not specified)" echo " This is overridden by the --container argument" echo " " echo " -d, --dev Runs the container in development mode, where the source" echo " files are mounted into the container dynamically, so they" echo " can more easily be edited from the host machine." echo " " echo " -v, --volume HOST_DIR:MOUNT_DIR Mount a path from the host system into" echo " the container. Should be specified as:" echo " " echo " -v /my/host/path:/my/container/path" echo " " echo " (these should be absolute paths)" echo " " echo " -r, --run RUN_COMMAND Command to run once the container is started." echo " Note that this argument must be invoked last," echo " as all further arguments will form the command." echo " If no run command is specified, an interactive" echo " terminal into the container will be provided." 
echo " " } die() { printf '%s\n' "$1" show_help exit 1 } # find container tag from os version source docker/tag.sh # where the project resides inside docker DOCKER_ROOT="/jetson-voice" # generate mount commands DATA_VOLUME="--volume $PWD/data:$DOCKER_ROOT/data" DEV_VOLUME="" # parse user arguments USER_VOLUME="" USER_COMMAND="" while :; do case $1 in -h|-\?|--help) show_help # Display a usage synopsis. exit ;; -c|--container) # Takes an option argument; ensure it has been specified. if [ "$2" ]; then CONTAINER_IMAGE=$2 shift else die 'ERROR: "--container" requires a non-empty option argument.' fi ;; --container=?*) CONTAINER_IMAGE=${1#*=} # Delete everything up to "=" and assign the remainder. ;; --container=) # Handle the case of an empty --image= die 'ERROR: "--container" requires a non-empty option argument.' ;; --ros) if [ "$2" ]; then ROS_DISTRO=$2 shift else ROS_DISTRO="foxy" fi ;; --ros=?*) ROS_DISTRO=${1#*=} # Delete everything up to "=" and assign the remainder. ;; --ros=) # Handle the case of an empty --image= ROS_DISTRO="foxy" ;; -d|--dev) DEV_VOLUME="--volume $PWD/jetson_voice:$DOCKER_ROOT/jetson_voice --volume $PWD/examples:$DOCKER_ROOT/examples --volume $PWD/scripts:$DOCKER_ROOT/scripts --volume $PWD/tests:$DOCKER_ROOT/tests" ;; -v|--volume) if [ "$2" ]; then USER_VOLUME=" -v $2 " shift else die 'ERROR: "--volume" requires a non-empty option argument.' fi ;; --volume=?*) USER_VOLUME=" -v ${1#*=} " # Delete everything up to "=" and assign the remainder. ;; --volume=) # Handle the case of an empty --image= die 'ERROR: "--volume" requires a non-empty option argument.' ;; -r|--run) if [ "$2" ]; then shift USER_COMMAND=" $@ " else die 'ERROR: "--run" requires a non-empty option argument.' fi ;; --) # End of all options. shift break ;; -?*) printf 'WARN: Unknown option (ignored): %s\n' "$1" >&2 ;; *) # Default case: No more options, so break out of the loop. 
break esac shift done # select the container, unless --container was explicitly specified if [ -z "$CONTAINER_IMAGE" ]; then CONTAINER_IMAGE="$CONTAINER_NAME:$TAG" if [ -n "$ROS_DISTRO" ]; then CONTAINER_IMAGE="$CONTAINER_NAME:$TAG-ros-$ROS_DISTRO" fi CONTAINER_REMOTE_IMAGE="dustynv/$CONTAINER_IMAGE" # check for local image if [[ "$(sudo docker images -q $CONTAINER_IMAGE 2> /dev/null)" == "" ]]; then CONTAINER_IMAGE=$CONTAINER_REMOTE_IMAGE fi fi echo "CONTAINER: $CONTAINER_IMAGE" echo "DEV_VOLUME: $DEV_VOLUME" echo "DATA_VOLUME: $DATA_VOLUME" echo "USER_VOLUME: $USER_VOLUME" echo "USER_COMMAND: $USER_COMMAND" MOUNTS="\ --device /dev/snd \ --device /dev/bus/usb \ --volume /etc/timezone:/etc/timezone:ro \ --volume /etc/localtime:/etc/localtime:ro \ $DEV_VOLUME \ $DATA_VOLUME \ $USER_VOLUME" if [ $ARCH = "aarch64" ]; then sudo docker run --runtime nvidia -it --rm \ --name=$CONTAINER_NAME \ --network host \ $MOUNTS $CONTAINER_IMAGE $USER_COMMAND elif [ $ARCH = "x86_64" ]; then sudo docker run --gpus all -it --rm \ --name=$CONTAINER_NAME \ --network=host \ --shm-size=8g \ --ulimit memlock=-1 \ --ulimit stack=67108864 \ $MOUNTS $CONTAINER_IMAGE $USER_COMMAND fi ================================================ FILE: docker/tag.sh ================================================ #!/usr/bin/env bash # find OS version source scripts/os_version.sh if [ $ARCH = "aarch64" ]; then TAG="r$L4T_VERSION" if [ $L4T_VERSION = "32.5.1" ] || [ $L4T_VERSION = "32.5.2" ]; then TAG="r32.5.0" fi elif [ $ARCH = "x86_64" ]; then TAG="$ARCH" else echo "unsupported architecture: $ARCH" exit 1 fi CONTAINER_NAME="jetson-voice" ================================================ FILE: examples/asr.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import sys from jetson_voice import ASR, AudioInput, ConfigArgParser, list_audio_devices parser = ConfigArgParser() parser.add_argument('--model', default='quartznet', type=str, help='path to model, service name, or json config file') parser.add_argument('--wav', default=None, type=str, help='path to input wav/ogg/flac file') parser.add_argument('--mic', default=None, type=str, help='device name or number of input microphone') parser.add_argument('--list-devices', action='store_true', help='list audio input devices') args = parser.parse_args() print(args) # list audio devices if args.list_devices: list_audio_devices() sys.exit() # load the model asr = ASR(args.model) # create the audio input stream stream = AudioInput(wav=args.wav, mic=args.mic, sample_rate=asr.sample_rate, chunk_size=asr.chunk_size) # run transcription for samples in stream: results = asr(samples) if asr.classification: print(f"class '{results[0]}' ({results[1]:.3f})") else: for transcript in results: print(transcript['text']) if transcript['end']: print('') print('\naudio stream closed.') ================================================ FILE: examples/assistant.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import sys import pprint from jetson_voice import ( ASR, NLP, TTS, AudioInput, AudioOutput, list_audio_devices, ConfigArgParser ) parser = ConfigArgParser() parser.add_argument('--asr-model', default='quartznet', type=str, help='ASR model') parser.add_argument('--nlp-model', default='distilbert_intent', type=str, help='NLP model') parser.add_argument('--tts-model', default='fastpitch_hifigan', type=str, help='TTS model') parser.add_argument('--wav', default=None, type=str, help='path to input wav/ogg/flac file') 
parser.add_argument('--mic', default=None, type=str, help='device name or number of input microphone') parser.add_argument('--output-device', default=None, type=str, help='device name or number of audio output') parser.add_argument('--list-devices', action='store_true', help='list audio input devices') args = parser.parse_args() print(args) # list audio devices if args.list_devices: list_audio_devices() sys.exit() # load the models tts = TTS(args.tts_model) asr = ASR(args.asr_model, add_punctuation=False) nlp = NLP(args.nlp_model) if asr.classification: raise ValueError(f"'{args.asr_model}' is a classification model - must use a transcription model for agent") if nlp.config.type != 'intent_slot': raise ValueError(f"'{args.nlp_model}' has type '{nlp.config.type}' - the agent requires an intent_slot model") # create the audio streams audio_input = AudioInput(wav=args.wav, mic=args.mic, sample_rate=asr.sample_rate, chunk_size=asr.chunk_size) audio_output = AudioOutput(device=args.output_device, sample_rate=tts.sample_rate) def get_slot(results, name, default='', threshold=0, merge=True): """ Retrieve a slot by name from the intent/slot results. The name can be a list of names, and any of them will be matched. Only slots with a score above the threshold will be returned. If merge is true, all slots by that name will be combined. If merge is false, the first matching slot will be returned. """ if isinstance(name, str): name = [name] slots = [] for slot in results['slots']: if any(slot['slot'] == n for n in name) and slot['score'] >= threshold: slots.append(slot['text']) if len(slots) == 0: return default if len(slots) > 1 and merge: return ' '.join(slots) return slots[0] def generate_response(query): results = nlp(query) pprint.pprint(results) intent = results['intent'] if intent == 'general_praise': return "Why thank you very much!" elif intent == 'weather_query': place = get_slot(results, 'place_name') date = get_slot(results, 'date') response = "The weather " if place: response += 'in ' + place + ' ' if date: response += date + ' ' return response + "is forecast to be sunny with a high of 78 degrees." elif intent == 'recommendation_locations': place = get_slot(results, ['place_name', 'business_name']) if not place: return "Please ask again with the name of a store or restaurant." return f"{place} is located 1 mile away at 1 2 3 Main Street." return "I'm sorry, I don't understand." 
# run agent for input_samples in audio_input: transcripts = asr(input_samples) for transcript in transcripts: print(transcript['text']) if not transcript['end']: continue print('') response = generate_response(transcript['text']) print(response) audio_output.write(tts(response)) """ if transcripts[0] != 'unknown' and transcripts[1] != 'silence': response = generate_response(transcripts[0]) print(response) audio_output.write(tts(response)) """ ================================================ FILE: examples/nlp.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import sys import pprint import readline from jetson_voice import NLP, ConfigArgParser parser = ConfigArgParser() parser.add_argument('--model', default='distilbert_sentiment', type=str) args = parser.parse_args() print(args) # load the model model = NLP(args.model) # QA models should run the nlp_qa.py example type = model.config.type if type == 'qa': raise ValueError("please run Question/Answer models with the nlp_qa.py sample") while True: print(f'\nEnter {type} query, or Q to quit:') query = input('> ') if query.upper() == 'Q': sys.exit() print('') results = model(query) if type == 'intent_slot' or type == 'text_classification': pprint.pprint(results) elif type == 'token_classification': print(f'{model.tag_string(query, results, scores=True)}') ================================================ FILE: examples/nlp_qa.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import sys import readline from jetson_voice import QuestionAnswer, ConfigArgParser parser = ConfigArgParser() parser.add_argument('--model', default='distilbert_qa_384', type=str) parser.add_argument('--top_k', default=1, type=int, help='show the top N answers (default 1)') args = parser.parse_args() print(args) model = QuestionAnswer(args.model) # load the QA model builtin_context = { "Amazon" : "The Amazon rainforest is a moist broadleaf forest that covers most of the Amazon basin of South America. " "This basin encompasses 7,000,000 square kilometres (2,700,000 sq mi), of which 5,500,000 square kilometres " "(2,100,000 sq mi) are covered by the rainforest. The majority of the forest is contained within Brazil, " "with 60% of the rainforest, followed by Peru with 13%, and Colombia with 10%.", "Geology" : "There are three major types of rock: igneous, sedimentary, and metamorphic. Igneous rocks are formed from " "melted rock deep inside the Earth. Sedimentary rocks are compressed layers of sand, silt, dead plants, and " "animal skeletons. Metamorphic rocks are other rocks that are changed by heat and pressure underground.", "Moon Landing" : "The first manned Moon landing was Apollo 11 on July, 20 1969. The first human to step on the Moon was " "astronaut Neil Armstrong followed second by Buzz Aldrin. They landed in the Sea of Tranquility with their " "lunar module the Eagle. They were on the lunar surface for 2.25 hours and collected 50 pounds of moon rocks.", "Pi" : "Some people have said that Pi is tasty but there should be a value for Pi, and the value for Pi is around 3.14. " "Pi is the ratio of a circle's circumference to its diameter. The constant Pi was first calculated by Archimedes " "in ancient Greece around the year 250 BC.", "Super Bowl 55" : "Super Bowl 55 took place on February 7, 2021 in Tampa, Florida between the Kansas City Chiefs and " "the Tampa Bay Buccaneers. The Tampa Bay Buccaneers won by a score of 31 to 9.
In his first season "
                      "with Tampa Bay, it was quarterback Tom Brady's seventh Super Bowl win in nine appearances.",
}

context = builtin_context['Amazon']

def print_context():
    print('\nContext:')
    print(context)

def parse_commands(entry):
    """
    Parse 'C' command for changing context, 'P' to print context, and 'Q' for quit.
    Returns true if a command was entered, otherwise false.
    """
    global context

    if entry == 'C':
        print('\nSelect from one of the following topics, or enter your own context paragraph:')

        for idx, key in enumerate(builtin_context):
            print(f' {idx+1}. {key}')

        entry = input('> ')

        try:
            # try parsing as a number
            num = int(entry)

            if num > 0 and num <= len(builtin_context):
                context = builtin_context[list(builtin_context.keys())[num-1]]
            else:
                print('Invalid entry')
        except:
            # try looking up topic name, otherwise custom paragraph
            if entry in builtin_context:
                context = builtin_context[entry]
            else:
                context = entry

        print_context()
        return True
    elif entry == 'P':
        print_context()
        return True
    elif entry == 'Q':
        sys.exit()

    return False

print_context()

while True:
    print('\nEnter a question, C to change context, P to print context, or Q to quit:')
    entry = input('> ')

    if parse_commands(entry.upper()):
        continue

    query = {
        'context' : context,
        'question' : entry
    }

    results = model(query, top_k=args.top_k)

    if args.top_k == 1:
        results = [results]

    for result in results:
        print('\nAnswer:', result['answer'])
        print('Score: ', result['score'])

================================================
FILE: examples/tts.py
================================================
#!/usr/bin/env python3
# coding: utf-8

import os
import sys
import time
import readline

from jetson_voice import TTS, ConfigArgParser, AudioOutput, list_audio_devices
from soundfile import SoundFile

parser = ConfigArgParser()

parser.add_argument('--model', default='fastpitch_hifigan', type=str)
parser.add_argument('--warmup', default=5, type=int, help='the number of warmup runs')
parser.add_argument("--output-device", default=None, type=str, help='output audio device to use')
parser.add_argument("--output-wav", default=None, type=str, help='output directory or wav file to write to')
parser.add_argument('--list-devices', action='store_true', help='list audio input devices')

args = parser.parse_args()
print(args)

# list audio devices
if args.list_devices:
    list_audio_devices()
    sys.exit()

# load the model
tts = TTS(args.model)

# open output audio device
if args.output_device:
    audio_device = AudioOutput(args.output_device, tts.sample_rate)

# create output wav directory
if args.output_wav:
    wav_is_dir = len(os.path.splitext(args.output_wav)[1]) == 0
    wav_count = 0

    if wav_is_dir and not os.path.exists(args.output_wav):
        os.makedirs(args.output_wav)

while True:
    print(f'\nEnter text, or Q to quit:')
    text = input('> ')

    if text.upper() == 'Q':
        sys.exit()

    print('')

    # run the TTS
    for run in range(args.warmup+1):
        start = time.perf_counter()
        audio = tts(text)
        stop = time.perf_counter()

        latency = stop-start
        duration = audio.shape[0]/tts.sample_rate

        print(f"Run {run} -- Time to first audio: {latency:.3f}s. Generated {duration:.2f}s of audio.
RTFx={duration/latency:.2f}.") # output the audio if args.output_device: audio_device.write(audio) if args.output_wav: wav_path = os.path.join(args.output_wav, f'{wav_count}.wav') if wav_is_dir else args.output_wav wav = SoundFile(wav_path, mode='w', samplerate=tts.sample_rate, channels=1) wav.write(audio) wav.close() wav_count += 1 print(f"\nWrote audio to {wav_path}") ================================================ FILE: jetson_voice/__init__.py ================================================ #!/usr/bin/env python3 # coding: utf-8 from .utils import ( find_resource, list_models, global_config, ConfigDict, ConfigArgParser, list_audio_devices, list_audio_inputs, list_audio_outputs, AudioInput, AudioOutput ) from .asr import ASR, ASRService from .tts import TTS, TTSService from .nlp import (NLP, IntentSlot, IntentSlotService, QuestionAnswer, QuestionAnswerService, TextClassification, TextClassificationService, TokenClassification, TokenClassificationService, ) from .auto import AutoModel __version__ = global_config.version ================================================ FILE: jetson_voice/asr.py ================================================ #!/usr/bin/env python3 # coding: utf-8 from jetson_voice.utils import load_resource def ASR(resource, *args, **kwargs): """ Loads a streaming ASR service or model. See the ASRService class for the signature that implementations use. """ factory_map = { 'riva' : 'jetson_voice.backends.riva.RivaASRService', 'tensorrt' : 'jetson_voice.models.asr.ASREngine', 'onnxruntime' : 'jetson_voice.models.asr.ASREngine' } return load_resource(resource, factory_map, *args, **kwargs) class ASRService(): """ Streaming ASR service base class. """ def __init__(self, config, *args, **kwargs): self.config = config def __call__(self, samples): """ Transcribe streaming audio samples to text, returning the running phrase. Phrases are broken up when a break in the audio is detected (i.e. end of sentence) Parameters: samples (array) -- Numpy array of audio samples. Returns a list[dict] of the running transcripts with the following keys: text (string) -- the transcript of the current sentence words (list[dict]) -- a list of word dicts that make up the sentence end (bool) -- if true, end-of-sentence due to silence Each transcript represents one phrase/sentence. When a sentence has been determined to be ended, it will be marked with end=True. Multiple sentence transcripts can be returned if one just ended and another is beginning. """ pass @property def classification(self): """ Returns true if this is an ASR classification model (e.g. for VAD or keyword spotting) Otherwise, this is an ASR transcription model that converts audio to text. """ return False @property def sample_rate(self): """ The sample rate that the model runs at (in Hz) Input audio should be resampled to this rate. """ pass @property def frame_length(self): """ Duration in seconds per frame / chunk. 
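To make the streaming contract concrete, chunk_size is simply frame_length converted to a sample count. A quick sanity check with representative values (16 kHz audio and the 1.0 s default frame length used by the streaming configs in this repo):

sample_rate  = 16000                            # Hz
frame_length = 1.0                              # seconds of audio consumed per call
chunk_size   = int(frame_length * sample_rate)  # -> 16000 samples passed to each __call__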
""" pass @property def chunk_size(self): """ Number of samples per frame/chunk (equal to frame_length * sample_rate) """ pass if __name__ == "__main__": from jetson_voice import list_audio_devices, AudioInput, ConfigArgParser import sys parser = ConfigArgParser() parser.add_argument('--model', default='quartznet', type=str, help='path to model, service name, or json config file') parser.add_argument('--wav', default=None, type=str, help='path to input wav file') parser.add_argument('--mic', default=None, type=str, help='device name or number of input microphone') parser.add_argument('--list-devices', action='store_true', help='list audio input devices') args = parser.parse_args() print(args) # list audio devices if args.list_devices: list_audio_devices() sys.exit() # load the model asr = ASR(args.model) # create the audio input stream stream = AudioInput(wav=args.wav, mic=args.mic, sample_rate=asr.sample_rate, chunk_size=asr.chunk_size) # run transcription for samples in stream: #samples = audio_to_float(samples) #print(f'samples {samples.shape} ({audio_db(samples):.1f} dB)') results = asr(samples) if asr.classification: print(f"class '{results[0]}' ({results[1]:.3f})") else: for transcript in results: print(transcript['text']) if transcript['end']: print('') print('\naudio stream closed.') ================================================ FILE: jetson_voice/auto.py ================================================ #!/usr/bin/env python3 # coding: utf-8 from jetson_voice.asr import ASR from jetson_voice.nlp import IntentSlot, QuestionAnswer, TextClassification, TokenClassification from jetson_voice.tts import TTS from jetson_voice.utils import load_resource def AutoModel(resource, domain=None, *args, **kwargs): """ Factory for automatically loading models and services. First the config is loaded and the type is checked. Then the correct instance for the resource is created. If a domain string is supplied (e.g. 'asr', 'nlp', 'tts'), then only resources from that domain will be created. 
""" type_map = { # models 'asr' : (ASR, 'asr'), 'asr_classification' : (ASR, 'asr'), 'intent_slot' : (IntentSlot, 'nlp'), 'qa' : (QuestionAnswer, 'nlp'), 'text_classification' : (TextClassification, 'nlp'), 'token_classification' : (TokenClassification, 'nlp'), 'tts': (TTS, 'tts'), # services 'jarvis_asr' : (ASR, 'asr') } config = load_resource(resource, None, *args, **kwargs) if 'type' not in config: raise ValueError(f"'type' setting missing from config '{config.path}'") if config.type not in type_map: raise ValueError(f"'{config.path}' has invalid 'type' ({config.type})") if domain: if type_map[config.type][1] != domain.lower(): raise ValueError(f"invalid model selected - '{config.path}' has domain '{type_map[config.type][1]}', but AutoModel() was called with domain={domain}") return type_map[config.type][0](config, *args, **kwargs) ================================================ FILE: jetson_voice/backends/onnxruntime/__init__.py ================================================ #!/usr/bin/env python3 # coding: utf-8 from .ort_model import OnnxRuntimeModel ================================================ FILE: jetson_voice/backends/onnxruntime/ort_model.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import logging # for some reason if PyCUDA isn't initialized before OnnxRuntime # and TensorRT is also used, it makes TensorRT error import pycuda.driver as cuda import pycuda.autoinit import numpy as np import onnxruntime as ort class OnnxRuntimeModel: """ Base class for OnnxRuntime models. """ def __init__(self, config, *args, **kwargs): """ Load an ONNX Runtime model. """ self.config = config logging.info(f"loading ONNX model '{self.config.model_path}' with onnxruntime") self.model = ort.InferenceSession(config.model_path, providers=['CUDAExecutionProvider']) logging.info(f"loaded ONNX model '{self.config.model_path}' with onnxruntime") self.inputs = self.model.get_inputs() self.outputs = self.model.get_outputs() for idx, binding in enumerate(self.inputs): print('') print(f"input {idx} - {binding.name}") print(f" shape: {binding.shape}") print(f" type: {binding.type}") print('') def execute(self, inputs, return_dict=False, **kwargs): """ Run the DNN model in TensorRT. The inputs are provided as numpy arrays in a list/tuple/dict. Note that run() doesn't perform any pre/post-processing - this is typically done in subclasses. Parameters: inputs (array, list[array], dict[array]) -- the network inputs as numpy array(s). If there is only one input, it can be provided as a single numpy array. If there are multiple inputs, they can be provided as numpy arrays in a list, tuple, or dict. Inputs in lists and tuples are assumed to be in the same order as the input bindings. Inputs in dicts should have keys with the same names as the input bindings. return_dict (bool) -- If True, the results will be returned in a dict of numpy arrays, where the keys are the names of the output binding names. By default, the results will be returned in a list of numpy arrays, in the same order as the output bindings. Returns the model output as a numpy array (if only one output), list[ndarray], or dict[ndarray]. 
""" if isinstance(inputs, np.ndarray): inputs = [inputs] assert len(inputs) == len(self.inputs) if isinstance(inputs, (list,tuple)): inputs = {self.inputs[i].name : input for i, input in enumerate(inputs)} elif not isinstance(inputs, dict): raise ValueError(f"inputs must be a list, tuple, or dict (instead got type '{type(inputs).__name__}')") outputs = self.model.run(None, inputs) if return_dict: return {self.outputs[i].name : output for i, output in enumerate(outputs)} if len(outputs) == 1: return outputs[0] return outputs ================================================ FILE: jetson_voice/backends/riva/__init__.py ================================================ #!/usr/bin/env python3 # coding: utf-8 from .riva_asr import RivaASRService from .riva_tts import RivaTTSService ================================================ FILE: jetson_voice/backends/riva/riva_asr.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import grpc import queue import threading import logging import riva_api.audio_pb2 as ra import riva_api.riva_asr_pb2 as rasr import riva_api.riva_asr_pb2_grpc as rasr_srv from jetson_voice import ASRService from jetson_voice.utils import audio_to_int16 class RivaASRService(ASRService): """ Riva streaming ASR service. """ def __init__(self, config, *args, **kwargs): """ Open a streaming channel to the Riva server for ASR. This establishes a connection over GRPC and sends/recieves the requests and responses asynchronously. Incoming audio samples get put into a request queue that GRPC picks up, and a thread waits on responses to come in. """ super(RivaASRService, self).__init__(config, *args, **kwargs) self.config.setdefault('server', 'localhost:50051') self.config.setdefault('sample_rate', 16000) self.config.setdefault('frame_length', 1.0) self.config.setdefault('request_timeout', 2.0) # how long to wait for new audio to come in self.config.setdefault('response_timeout', 0.05) # how long to wait for results from riva self.config.setdefault('language_code', 'en-US') self.config.setdefault('enable_automatic_punctuation', True) self.config.setdefault('top_k', 1) logging.info(f'Riva ASR service config:\n{self.config}') self.channel = grpc.insecure_channel(self.config.server) self.client = rasr_srv.RivaSpeechRecognitionStub(self.channel) self.recognition_config = rasr.RecognitionConfig( encoding = ra.AudioEncoding.LINEAR_PCM, sample_rate_hertz = self.config.sample_rate, language_code = self.config.language_code, max_alternatives = self.config.top_k, enable_word_time_offsets = True, enable_automatic_punctuation = self.config.enable_automatic_punctuation ) self.streaming_config = rasr.StreamingRecognitionConfig( config = self.recognition_config, interim_results = True ) self.request_queue = queue.Queue() self.request_queue.put(rasr.StreamingRecognizeRequest(streaming_config=self.streaming_config)) self.responses = self.client.StreamingRecognize(self) self.responses_queue = queue.Queue() self.response_thread = threading.Thread(target=self.recieve_responses) self.response_thread.start() def __call__(self, samples): """ Transcribe streaming audio samples to text, returning the running phrase. Phrases are broken up when a break in the audio is detected (i.e. end of sentence) Parameters: samples (array) -- Numpy array of audio samples. 
Returns a list[dict] of the running transcripts with the following keys: text (string) -- the transcript of the current sentence words (list[dict]) -- a list of word dicts that make up the sentence end (bool) -- if true, end-of-sentence due to silence Each transcript represents one phrase/sentence. When a sentence has been determined to be ended, it will be marked with end=True. Multiple sentence transcripts can be returned if one just ended and another is beginning. """ samples = audio_to_int16(samples) self.request_queue.put(rasr.StreamingRecognizeRequest(audio_content=samples.tobytes())) transcripts = [] while True: try: transcripts.append(self.responses_queue.get(block=True, timeout=self.config.response_timeout)) except queue.Empty: break return transcripts def __next__(self): """ Retrieve the next request containing audio samples to send to the Riva server. This is implemented using an iterator interface as that is what GRPC expects. """ try: request = self.request_queue.get(block=True, timeout=self.config.request_timeout) return request except queue.Empty: logging.debug(f'{self.config.request_timeout} second timeout occurred waiting for audio samples, stopping Riva ASR service') raise StopIteration def recieve_responses(self): """ Wait to recieve responses from the Riva server and parse them. """ logging.debug('starting Riva ASR service response reciever thread') for response in self.responses: # this is blocking if not response.results: continue result = response.results[0] if not result.alternatives: continue text = result.alternatives[0].transcript text = text.strip() if len(text) == 0: continue self.responses_queue.put({ 'text' : text, 'end' : result.is_final }) logging.debug('exiting Riva ASR service response reciever thread') @property def sample_rate(self): """ The sample rate that the model runs at (in Hz) Input audio should be resampled to this rate. """ return self.config.sample_rate @property def frame_length(self): """ Duration in seconds per frame / chunk. """ return self.config.frame_length @property def chunk_size(self): """ Number of samples per frame/chunk (equal to frame_length * sample_rate) """ return int(self.frame_length * self.sample_rate) ================================================ FILE: jetson_voice/backends/riva/riva_tts.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import grpc import logging import numpy as np import riva_api.audio_pb2 as ra import riva_api.riva_tts_pb2 as rtts import riva_api.riva_tts_pb2_grpc as rtts_srv from jetson_voice import TTSService class RivaTTSService(TTSService): """ Riva streaming TTS service. """ def __init__(self, config, *args, **kwargs): """ Open a streaming channel to the Riva server for TTS. This establishes a connection over GRPC and sends/recieves the requests and responses. """ super(RivaTTSService, self).__init__(config, *args, **kwargs) self.config.setdefault('server', 'localhost:50051') self.config.setdefault('sample_rate', 22050) # ignored (will always be 22.05KHz) self.config.setdefault('voice_name', 'ljspeech') # ignored self.config.setdefault('language_code', 'en-US') logging.info(f'Riva TTS service config:\n{self.config}') self.channel = grpc.insecure_channel(self.config.server) self.client = rtts_srv.RivaSpeechSynthesisStub(self.channel) def __call__(self, text): """ Generate audio from text. Parameters: text (string) -- The phrase to convert to audio. Returns audio samples in a numpy array. 
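A minimal usage sketch, assuming a Riva server on the default localhost:50051 and playback through the AudioOutput helper; the resource name here is hypothetical (any config whose backend resolves to this service would do):

from jetson_voice import TTS, AudioOutput

tts = TTS('riva_tts')                              # hypothetical resource name for this backend
audio_output = AudioOutput(device=None, sample_rate=tts.sample_rate)

samples = tts('Hello from the Riva TTS service.')  # float32 numpy array at 22050 Hz
audio_output.write(samples)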
""" req = rtts.SynthesizeSpeechRequest() req.text = text req.language_code = self.config.language_code req.sample_rate_hz = self.config.sample_rate req.voice_name = self.config.voice_name req.encoding = ra.AudioEncoding.LINEAR_PCM resp = self.client.Synthesize(req) samples = np.frombuffer(resp.audio, dtype=np.float32) return samples @property def sample_rate(self): """ Get the output sample rate (in Hz) """ return self.config.sample_rate ================================================ FILE: jetson_voice/backends/tensorrt/__init__.py ================================================ #!/usr/bin/env python3 # coding: utf-8 from .trt_model import TRTModel ================================================ FILE: jetson_voice/backends/tensorrt/trt_binding.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import logging import tensorrt as trt import pycuda.driver as cuda import pycuda.autoinit class Binding: """ Represents an input/output tensor to the model. """ def __init__(self, model, index): """ Parameters: model (TRTModel) -- parent model instance index (int) -- index of the binding in the model """ self.model = model self.index = index self.name = model.trt_engine.get_binding_name(index) self.shape = tuple(model.trt_engine.get_binding_shape(index)) self.dtype = model.trt_engine.get_binding_dtype(index) self.input = model.trt_engine.binding_is_input(index) self.size = max(trt.volume(self.shape) * self.dtype.itemsize, 0) self.dynamic = (self.size <= 0) self.profiles = [] if self.input: for i in range(model.trt_engine.num_optimization_profiles): profile = model.trt_engine.get_profile_shape(i, index) self.profiles.append(dict( min = profile[0], opt = profile[1], max = profile[2])) self.alloc() def alloc(self, shape=None): """ Allocate memory for the binding. alloc() is called automatically when needed. If new shape is provided, it will update the internal state. """ if shape is not None: self.shape = shape self.size = trt.volume(self.shape) * self.dtype.itemsize if self.size <= 0: # dynamic with shape not yet set self.host = None self.device = None return self.host = None if self.input else cuda.pagelocked_empty(self.shape, dtype=trt.nptype(self.dtype)) self.device = cuda.mem_alloc(self.size) def set_shape(self, shape): """ Set the shape of a dynamic input binding. """ if not self.dynamic: raise ValueError(f"binding '{self.name}' is not dynamic") if not self.input: raise ValueError(f"binding '{self.name}' is not an input") # check to see if the shape already matches if self.shape == shape: logging.debug(f"binding '{self.name}' already has shape {shape}") return logging.debug(f"binding '{self.name}' has new shape {shape}") # set the new shape if not self.model.trt_context.set_binding_shape(self.index, shape): raise ValueError(f"failed to set binding '{self.name}' with shape {shape}") # re-allocate tensor memory self.alloc(shape) def query_shape(self): """ Updates the shape of a dynamic output binding. 
""" if not self.dynamic: return if self.input: raise ValueError(f"binding '{self.name}' is not an output") # get the new shape shape = tuple(self.model.trt_context.get_binding_shape(self.index)) # check to see if the shape already matches if self.shape == shape: logging.debug(f"binding '{self.name}' already has shape {shape}") return logging.debug(f"binding '{self.name}' has new output shape {shape}") # re-allocate tensor memory self.alloc(shape) return shape def __str__(self): return ( f"binding {self.index} - '{self.name}'\n" f" input: {self.input}\n" f" shape: {self.shape}\n" f" dtype: {self.dtype}\n" f" size: {self.size}\n" f" dynamic: {self.dynamic}\n" f" profiles: {self.profiles}\n" ) ================================================ FILE: jetson_voice/backends/tensorrt/trt_builder.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import time import json import logging import tensorrt as trt import pycuda.driver as cuda import pycuda.autoinit TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE) def build_engine(config, output=None, precision='fp16', batch_size=1, dynamic_shapes=None, workspace=128, parse_only=False): """ Build TensorRT engine from ONNX model. Parameters: model (string) -- path to ONNX model config (string) -- path to model configuration json (will be inferred from model path if empty) output (string) -- path to output serialized TensorRT engine (will be inferred from model path if empty) precision (string) -- fp32 or fp16 (int8 not currently supported) batch_size (int) -- the maximum batch size (default 1) dynamic_shape (dict) -- dynamic shape profiles for min/max/opt workspace (int) -- builder workspace memory size (in MB) parse_only (bool) -- if true, test parsing the model before exiting without building the TensorRT engine Returns the built TensorRT engine (ICudaEngine) """ # set default output path if output is None or output == '': output = f'{os.path.splitext(config.model_path)[0]}.engine' # create TensorRT resources builder = trt.Builder(TRT_LOGGER) builder_config = builder.create_builder_config() network = builder.create_network(1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) parser = trt.OnnxParser(network, TRT_LOGGER) builder_config.max_workspace_size = workspace * 1 << 20 # set precision precision = precision.lower() if precision == 'fp16': builder_config.set_flag(trt.BuilderFlag.FP16) logging.info(f'enabled FP16 precision') elif precision == 'int8': # https://github.com/NVIDIA/TensorRT/blob/d7baf010e4396c87d58e4d8a33052c01c2d89325/demo/BERT/builder.py#L592 raise NotImplementedError('INT8 support not yet implemented') # load the model (from ONNX) logging.info(f'loading {config.model_path}') with open(config.model_path, 'rb') as model_file: if not parser.parse(model_file.read()): logging.error(f'failed to parse ONNX model {config.model_path}') for error in range(parser.num_errors): print (parser.get_error(error)) return None # create dynamic shape profile # TODO refactor this to an abstract .get_dynamic_shapes() implementation in each subclass # TODO this currently uses same shape for all inputs - allow for different shape profiles profile = builder.create_optimization_profile() opt_shape = None """ if model_type == 'qa' or model_type == 'text_classification' or model_type == 'token_classification': min_shape = (1, 1) # (batch_size, sequence_length) max_shape = (batch_size, model_config['dataset']['max_seq_length']) elif model_type == 'intent_slot': min_shape = (1, 1) # (batch_size, sequence_length) 
max_shape = (batch_size, model_config['language_model']['max_seq_length']) elif model_type == 'asr': features = model_config['preprocessor']['features'] sample_rate = model_config['preprocessor']['sample_rate'] sample_to_fft = 1.0 / 160.0 # rough conversion from samples to MEL spectrogram dims sample_multiplier = sample_rate * sample_to_fft min_shape = (batch_size, features, int(0.5 * sample_multiplier)) # minimum plausible frame length opt_shape = (batch_size, features, int(1.2 * sample_multiplier)) # default of .1s overlap factor (1,64,121) max_shape = (batch_size, features, int(3.0 * sample_multiplier)) # enough for 1s overlap factor elif model_type == 'asr_classification': features = model_config['preprocessor']['n_mels'] sample_rate = model_config['sample_rate'] sample_to_fft = 1.0 / 160.0 # rough conversion from samples to MEL spectrogram dims sample_multiplier = sample_rate * sample_to_fft min_shape = (batch_size, features, int(0.5 * sample_multiplier)) # minimum plausible frame length opt_shape = (batch_size, features, int(1.2 * sample_multiplier)) # default of .1s overlap factor (1,64,121) max_shape = (batch_size, features, int(3.0 * sample_multiplier)) # enough for 1s overlap factor elif model_type == 'tts_vocoder': min_shape = (batch_size, model_config['features'], 1) opt_shape = (batch_size, model_config['features'], 160) # ~5-6 words max_shape = (batch_size, model_config['features'], 512) # ~15-20 words? else: raise NotImplementedError(f"model type '{model_type}' is unrecognized or not supported") """ # TODO support different shape profiles for different input tensors if dynamic_shapes is not None: if 'min' not in dynamic_shapes: dynamic_shapes['min'] = dynamic_shapes['max'] if 'opt' not in dynamic_shapes: dynamic_shapes['opt'] = dynamic_shapes['max'] for i in range(network.num_inputs): # TODO confirm that input is in fact dynamic profile.set_shape(network.get_input(i).name, min=dynamic_shapes['min'], opt=dynamic_shapes['opt'], max=dynamic_shapes['max']) builder_config.add_optimization_profile(profile) def print_summary(): print('') print('----------------------------------------------------') print(' BUILDER CONFIGURATION') print('----------------------------------------------------') print(f' - model {config.model_path}') print(f' - config {config.path}') print(f' - output {output}') print(f' - type {config.type}') print(f' - layers {network.num_layers}') print(f' - inputs {network.num_inputs}') print(f' - outputs {network.num_outputs}') print(f' - precision {precision}') print(f' - workspace {workspace}') print('') for i in range(network.num_inputs): tensor = network.get_input(i) print(f' - input {i}:') print(f' - name {tensor.name}') print(f' - shape {tensor.shape}') print(f' - dtype {tensor.dtype}') for i in range(network.num_outputs): tensor = network.get_output(i) print(f' - output {i}:') print(f' - name {tensor.name}') print(f' - shape {tensor.shape}') print(f' - dtype {tensor.dtype}') print_summary() if parse_only: return None # build the engine build_start_time = time.time() engine = builder.build_engine(network, builder_config) if engine is None: raise ValueError(f"failed to build TensorRT engine for '{config.model_path}'") build_time_elapsed = (time.time() - build_start_time) print(f'\nbuilt engine in {build_time_elapsed} seconds') print_summary() # save engine print('\nserializing engine...') serialized_engine = engine.serialize() with open(output, "wb") as engine_file: engine_file.write(serialized_engine) print(f'saved engine to {output}') return engine ''' if 
__name__ == "__main__": import argparse parser = argparse.ArgumentParser() parser.add_argument('--config', default='', type=str) parser.add_argument('--output', default='', type=str) parser.add_argument('--precision', default='fp16', choices=['fp32', 'fp16', 'int8'], type=str) parser.add_argument('--batch-size', default=1, type=int) # max batch size parser.add_argument('--workspace', default=utils.DEFAULT_WORKSPACE, type=int) parser.add_argument('--parse-only', action='store_true') args = parser.parse_args() print(args) build_engine(config=args.config, output=args.output, precision=args.precision, batch_size=args.batch_size, workspace=args.workspace, parse_only=args.parse_only) ''' ================================================ FILE: jetson_voice/backends/tensorrt/trt_model.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import time import json import logging import pprint import numpy as np import tensorrt as trt import pycuda.driver as cuda import pycuda.autoinit from .trt_builder import build_engine, TRT_LOGGER from .trt_binding import Binding class TRTModel: """ Base class for TensorRT models. """ def __init__(self, config, dynamic_shapes=None, *args, **kwargs): """ Load a TensorRT model from ONNX or serialized TensorRT engine. Parameters: config (ConfigDict) -- configuration dict dynamic_shapes (dict) -- dynamic shape profiles for min/max/opt """ self.config = config # determine if the TensorRT engine already exists model_root, model_ext = os.path.splitext(self.config.model_path) model_ext = model_ext.lower() if model_ext == '.onnx': engine_path = model_root + '.engine' if os.path.exists(engine_path): logging.info(f'loading cached TensorRT engine from {engine_path}') self.config.model_path = engine_path model_ext = '.engine' # either build or load TensorRT engine if model_ext == '.onnx': self.trt_engine = build_engine(self.config, dynamic_shapes=dynamic_shapes) elif model_ext == '.engine' or model_ext == '.plan': with open(self.config.model_path, 'rb') as f: self.trt_runtime = trt.Runtime(TRT_LOGGER) self.trt_engine = self.trt_runtime.deserialize_cuda_engine(f.read()) else: raise ValueError(f"invalid model extension '{model_ext}' (should be .onnx, .engine, or .plan)") if self.trt_engine is None: raise IOError(f'failed to load TensorRT engine from {self.model_path}') self.trt_context = self.trt_engine.create_execution_context() logging.info(f'loaded TensorRT engine from {self.config.model_path}') # create a stream in which to copy inputs/outputs and run inference self.stream = cuda.Stream() # enumerate bindings self.bindings = [] self.inputs = [] self.outputs = [] for i in range(len(self.trt_engine)): binding = Binding(self, i) self.bindings.append(binding) if binding.input: self.inputs.append(binding) else: self.outputs.append(binding) for binding in self.bindings: print(f'\n{binding}') def execute(self, inputs, sync=True, return_dict=False, **kwargs): """ Run the DNN model in TensorRT. The inputs are provided as numpy arrays in a list/tuple/dict. Note that run() doesn't perform any pre/post-processing - this is typically done in subclasses. Parameters: inputs (array, list[array], dict[array]) -- the network inputs as numpy array(s). If there is only one input, it can be provided as a single numpy array. If there are multiple inputs, they can be provided as numpy arrays in a list, tuple, or dict. Inputs in lists and tuples are assumed to be in the same order as the input bindings. 
Inputs in dicts should have keys with the same names as the input bindings. sync (bool) -- If True (default), will wait for the GPU to be done processing before returning. return_dict (bool) -- If True, the results will be returned in a dict of numpy arrays, where the keys are the names of the output binding names. By default, the results will be returned in a list of numpy arrays, in the same order as the output bindings. Returns the model output as a numpy array (if only one output), list[ndarray], or dict[ndarray]. """ if isinstance(inputs, np.ndarray): inputs = [inputs] assert len(inputs) == len(self.inputs) # setup inputs + copy to GPU def setup_binding(binding, input): input = input.astype(trt.nptype(binding.dtype), copy=False) if binding.dynamic: binding.set_shape(input.shape) cuda.memcpy_htod_async(binding.device, np.ascontiguousarray(input), self.stream) if isinstance(inputs, (list,tuple)): for idx, input in enumerate(inputs): setup_binding(self.bindings[idx], input) elif isinstance(inputs, dict): for binding_name in inputs: setup_binding(self.find_binding(binding_name), inputs[binding_name]) else: raise ValueError(f"inputs must be a list, tuple, or dict (instead got type '{type(inputs).__name__}')") assert self.trt_context.all_binding_shapes_specified assert self.trt_context.all_shape_inputs_specified # query new dynamic output shapes for output in self.outputs: output.query_shape() # run inference self.trt_context.execute_async_v2( bindings=[int(binding.device) for binding in self.bindings], stream_handle=self.stream.handle ) # copy outputs to CPU for output in self.outputs: cuda.memcpy_dtoh_async(output.host, output.device, self.stream) # wait for completion if sync: self.stream.synchronize() # return results if return_dict: results = {} for output in self.outputs: results[output.name] = output.host return results else: if len(self.outputs) == 1: return self.outputs[0].host else: return tuple([output.host for output in self.outputs]) def find_binding(self, name): """ Lookup an input/output binding by name """ for binding in self.bindings: if binding.name == name: return binding logging.error(f"couldn't find binding with name '{name}'") return None def set_shape(self, binding, shape): """ Set the shape of a dynamic binding. 
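Per the type checks below, the binding can be specified by index or by name; a hedged sketch with an illustrative binding name and shape, given a loaded TRTModel 'model':

model.set_shape(0, (1, 64, 121))               # by binding index
model.set_shape('audio_signal', (1, 64, 121))  # by binding name (hypothetical)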
""" if isinstance(binding, int): binding = self.bindings[binding] elif isinstance(binding, str): binding = self.find_binding(binding) elif not isinstance(binding, dict): raise ValueError(f'binding must be specified as int, string, or dict (got {type(binding).__name__})') binding.set_shape(shape) ================================================ FILE: jetson_voice/models/__init__.py ================================================ #!/usr/bin/env python3 # coding: utf-8 from .asr import ASREngine from .nlp import IntentSlotEngine, QuestionAnswerEngine, TextClassificationEngine, TokenClassificationEngine from .tts import TTSEngine ================================================ FILE: jetson_voice/models/asr/__init__.py ================================================ #!/usr/bin/env python3 # coding: utf-8 from .asr_engine import ASREngine ================================================ FILE: jetson_voice/models/asr/asr_engine.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import time import pprint import logging import importlib import torch import numpy as np from .ctc_decoder import CTCDecoder from jetson_voice.asr import ASRService from jetson_voice.utils import audio_to_float, global_config, load_model, softmax class ASREngine(ASRService): """ Streaming ASR (Automatic Speech Recognition) model in TensorRT or onnxruntime. This model is primarily designed to be used on a live audio source like a microphone. """ def __init__(self, config, *args, **kwargs): """ Loads a streaming ASR model from ONNX or serialized TensorRT engine. Parameters: model (string) -- path to ONNX model or serialized TensorRT engine/plan config (string) -- path to model configuration json (will be inferred from model path if empty) """ super(ASREngine, self).__init__(config, *args, **kwargs) if self.config.type != 'asr' and self.config.type != 'asr_classification': raise ValueError(f"{self.config.model_path} isn't an ASR model (type '{self.config.type}'") # set some default config options that are non-standard in nemo if 'streaming' not in self.config: self.config['streaming'] = {} self.config['streaming'].setdefault('frame_length', 1.0) # duration of signal frame, seconds (TODO shorter defaults for VAD/command classifiers) self.config['streaming'].setdefault('frame_overlap', 0.5) # duration of overlap before/after current frame, seconds # some config changes for streaming if not self.classification: self.config['preprocessor']['dither'] = 0.0 self.config['preprocessor']['pad_to'] = 0 if 'ctc_decoder' not in self.config: self.config['ctc_decoder'] = {} self.config['ctc_decoder'].setdefault('type', 'greedy') # greedy or beamsearch self.config['ctc_decoder'].setdefault('add_punctuation', True) # add period to the end of sentences if 'add_punctuation' in kwargs: self.config['ctc_decoder']['add_punctuation'] = kwargs['add_punctuation'] logging.info(f"add_punctuation = {kwargs['add_punctuation']}") if not self.classification and self.config['preprocessor']['features'] == 64: # TODO normalization coefficients for citrinet (N=80) normalization = {} normalization['fixed_mean'] = [ -14.95827016, -12.71798736, -11.76067913, -10.83311182, -10.6746914, -10.15163465, -10.05378331, -9.53918999, -9.41858904, -9.23382904, -9.46470918, -9.56037, -9.57434245, -9.47498732, -9.7635205, -10.08113074, -10.05454561, -9.81112681, -9.68673603, -9.83652977, -9.90046248, -9.85404766, -9.92560366, -9.95440354, -10.17162966, -9.90102482, -9.47471025, -9.54416855, -10.07109475, -9.98249912, 
-9.74359465, -9.55632283, -9.23399915, -9.36487649, -9.81791084, -9.56799225, -9.70630899, -9.85148006, -9.8594418, -10.01378735, -9.98505315, -9.62016094, -10.342285, -10.41070709, -10.10687659, -10.14536695, -10.30828702, -10.23542833, -10.88546868, -11.31723646, -11.46087382, -11.54877829, -11.62400934, -11.92190509, -12.14063815, -11.65130117, -11.58308531, -12.22214663, -12.42927197, -12.58039805, -13.10098969, -13.14345864, -13.31835645, -14.47345634] normalization['fixed_std'] = [ 3.81402054, 4.12647781, 4.05007065, 3.87790987, 3.74721178, 3.68377423, 3.69344, 3.54001005, 3.59530412, 3.63752368, 3.62826417, 3.56488469, 3.53740577, 3.68313898, 3.67138151, 3.55707266, 3.54919572, 3.55721289, 3.56723346, 3.46029304, 3.44119672, 3.49030548, 3.39328435, 3.28244406, 3.28001423, 3.26744937, 3.46692348, 3.35378948, 2.96330901, 2.97663111, 3.04575148, 2.89717604, 2.95659301, 2.90181116, 2.7111687, 2.93041291, 2.86647897, 2.73473181, 2.71495654, 2.75543763, 2.79174615, 2.96076456, 2.57376336, 2.68789782, 2.90930817, 2.90412004, 2.76187531, 2.89905006, 2.65896173, 2.81032176, 2.87769857, 2.84665271, 2.80863137, 2.80707634, 2.83752184, 3.01914511, 2.92046439, 2.78461139, 2.90034605, 2.94599508, 2.99099718, 3.0167554, 3.04649716, 2.94116777] self.config['preprocessor']['normalize'] = normalization # create preprocessor instance preprocessor_name = self.config['preprocessor']['_target_'].rsplit(".", 1) preprocessor_class = getattr(importlib.import_module(preprocessor_name[0]), preprocessor_name[1]) logging.debug(f'ASR preprocessor - {preprocessor_class}') preprocessor_config = self.config['preprocessor'].copy() preprocessor_config.pop('_target_') self.preprocessor = preprocessor_class(**preprocessor_config) # load the model features = self.config.preprocessor.n_mels if self.classification else self.config.preprocessor.features time_to_fft = self.sample_rate * (1.0 / 160.0) # rough conversion from samples to MEL spectrogram dims dynamic_shapes = { 'min' : (1, features, int(0.1 * time_to_fft)), # minimum plausible frame length 'opt' : (1, features, int(1.5 * time_to_fft)), # default of .5s overlap factor (1,64,121) 'max' : (1, features, int(3.0 * time_to_fft)) # enough for 2s overlap factor } self.model = load_model(self.config, dynamic_shapes) # create CTC decoder if not self.classification: self.ctc_decoder = CTCDecoder.from_config(self.config['ctc_decoder'], self.config['decoder']['vocabulary'], os.path.dirname(self.config.model_path)) logging.info(f"CTC decoder type: '{self.ctc_decoder.type}'") # create streaming buffer self.n_frame_len = int(self.frame_length * self.sample_rate) self.n_frame_overlap = int(self.frame_overlap * self.sample_rate) self.buffer_length = self.n_frame_len + self.n_frame_overlap self.buffer_duration = self.buffer_length / self.sample_rate self.buffer = np.zeros(shape=self.buffer_length, dtype=np.float32) # 2*self.n_frame_overlap def __call__(self, samples): """ Transcribe streaming audio samples to text, returning the running phrase. Phrases are broken up when a break in the audio is detected (i.e. end of sentence) Parameters: samples (array) -- Numpy array of audio samples. Returns a dict of the running phrase. 
transcript (string) -- the current transcript latest (string) -- the latest additions to the transcript end (bool) -- if true, end-of-sequence due to silence """ samples = audio_to_float(samples) if len(samples) < self.n_frame_len: samples = np.pad(samples, [0, self.n_frame_len - len(samples)], 'constant') self.buffer[:self.n_frame_overlap] = self.buffer[-self.n_frame_overlap:] self.buffer[self.n_frame_overlap:] = samples if global_config.profile: preprocess_begin = time.perf_counter() # apply pre-processing preprocessed_signal, _ = self.preprocessor( input_signal=torch.as_tensor(self.buffer, dtype=torch.float32).unsqueeze(dim=0), length=torch.as_tensor(self.buffer.size, dtype=torch.int64).unsqueeze(dim=0) ) if global_config.profile: logging.info(f'preprocess time: {time.perf_counter() - preprocess_begin}') network_begin = time.perf_counter() # run the asr model logits = self.model.execute(torch_to_numpy(preprocessed_signal)) logits = np.squeeze(logits) logits = softmax(logits, axis=-1) if global_config.profile: logging.info(f'network time: {time.perf_counter() - network_begin}') self.timestep_duration = self.buffer_duration / logits.shape[0] self.n_timesteps_frame = int(self.frame_length / self.timestep_duration) self.n_timesteps_overlap = int(self.frame_overlap / self.timestep_duration) if self.classification: argmax = np.argmax(logits) prob = logits[argmax] return (self.config['labels'][argmax], prob) else: self.ctc_decoder.set_timestep_duration(self.timestep_duration) self.ctc_decoder.set_timestep_delta(self.n_timesteps_frame) if global_config.profile: ctc_decoder_begin = time.perf_counter() transcripts = self.ctc_decoder.decode(logits) if global_config.profile: logging.info(f'ctc_decoder time: {time.perf_counter() - ctc_decoder_begin}') return transcripts @property def classification(self): """ Returns true if this is an ASR classification model. """ return self.config.type == 'asr_classification' @property def sample_rate(self): """ The sample rate that the model runs at. Input audio should be resampled to this rate. """ return self.config['sample_rate'] if self.classification else self.config['preprocessor']['sample_rate'] @property def frame_length(self): """ Duration in seconds per frame / chunk. """ return self.config['streaming']['frame_length'] @property def frame_overlap(self): """ Duration of overlap in seconds before/after current frame. """ return self.config['streaming']['frame_overlap'] @property def chunk_size(self): """ Number of samples per frame/chunk (equal to frame_length * sample_rate) """ return self.n_frame_len def torch_to_numpy(tensor): return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy() ================================================ FILE: jetson_voice/models/asr/ctc_beamsearch.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import logging from .ctc_decoder import CTCDecoder from .ctc_utils import find_silent_intervals, merge_words, rebase_word_times, split_words, transcript_from_words from ctc_decoders import Scorer from swig_decoders import BeamDecoder, ctc_beam_search_decoder_ex from jetson_voice.utils import global_config class CTCBeamSearchDecoder(CTCDecoder): """ CTC beam search decoder that optionally uses a language model. """ def __init__(self, config, vocab, resource_path=None): """ Create a new CTCBeamSearchDecoder. See CTCDecoder.from_config() to automatically create the correct type of instance dependening on config. 
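For illustration, a ctc_decoder config section that routes a model onto this beam search path could look like the following; the language model file name is hypothetical, and the numeric values mirror the setdefault() defaults set below:

config['ctc_decoder'] = {
    'type'           : 'beamsearch',
    'language_model' : 'lm.binary',  # hypothetical KenLM file, resolved against the model directory if relative
    'beam_width'     : 32,
    'alpha'          : 0.7,          # language model weight
    'beta'           : 0.0,          # word insertion bonus
    'top_k'          : 3,
}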
""" super().__init__(config, vocab) self.config.setdefault('word_threshold', -1000.0) self.reset() self.scorer = None #self.num_cores = max(os.cpu_count(), 1) # set default config # https://github.com/NVIDIA/NeMo/blob/855ce265b80c0dc40f4f06ece76d2c9d6ca1be8d/nemo/collections/asr/modules/beam_search_decoder.py#L21 self.config.setdefault('language_model', None) self.config.setdefault('beam_width', 32)#128) self.config.setdefault('alpha', 0.7 if self.language_model else 0.0) self.config.setdefault('beta', 0.0) self.config.setdefault('cutoff_prob', 1.0) self.config.setdefault('cutoff_top_n', 40) self.config.setdefault('top_k', 3) # check for language model file if self.language_model: if not os.path.isfile(self.language_model): self.config['language_model'] = os.path.join(resource_path, self.language_model) if not os.path.isfile(self.language_model): raise IOError(f"language model file '{self.language_model}' does not exist") logging.info('creating CTCBeamSearchDecoder') logging.info(str(self.config)) # create scorer if self.language_model: self.scorer = Scorer(self.config['alpha'], self.config['beta'], model_path=self.language_model, vocabulary=self.vocab) def decode(self, logits): """ Decode logits into words, and merge the new words with the previous words from the running transcript. Returns the running transcript as a list of word dictionaries, where each word dict has he following keys: 'text' (str) -- the text of the word 'score' (float) -- the probability of the word 'start_time' (int) -- the start time of the word (in timesteps) 'end_time' (int) -- the end time of the word (in timesteps) Note that the start/end times are transformed from timestamps into seconds by the ASR engine after CTCDecoder.decode() is called. """ results = ctc_beam_search_decoder_ex( logits.tolist(), self.vocab, self.config['beam_width'], self.config['cutoff_prob'], self.config['cutoff_top_n'], self.config['top_k'], self.timestep, self.scorer) if global_config.debug: print('BeamSearch results', len(results)) for idx, result in enumerate(results): print(f" beam {idx} [{result.score:.3f}] '{result.text}'") for word_idx, word in enumerate(result.words): print(f" word {word_idx} [{word.start_time}:{word.end_time} {word.score:.3f}] '{word.text}'") words = [{ 'text' : word.text, 'score' : word.score, 'start_time' : word.start_time, 'end_time' : word.end_time } for word in results[0].words] # merge new words with past words self.words = merge_words(self.words, words, self.config['word_threshold'], 'similarity') # look for silent/EOS intervals silent_intervals = find_silent_intervals(logits, len(self.vocab), self.timesteps_silence, self.timestep) if global_config.debug: print(f'silent intervals: {silent_intervals}') self.timestep += self.timestep_delta # split the words at EOS intervals if len(silent_intervals) > 0: wordlists = split_words(self.words, silent_intervals) transcripts = [] for idx, wordlist in enumerate(wordlists): # ignore blanks (silence after EOS has already occurred) if len(wordlist) == 0: continue # if there is only one wordlist, then it must be EOS # if there are multiple, then the last one is not EOS end = (len(wordlists) == 1) or (idx < (len(wordlists) - 1)) if end: wordlist = rebase_word_times(wordlist) self.reset() # TODO reset timesteps counter correctly else: self.words = wordlist transcripts.append((wordlist, end)) else: transcripts = [(self.words, False)] return [{ 'text' : transcript_from_words(words, scores=global_config.debug, times=global_config.debug, end=end, 
add_punctuation=self.config['add_punctuation']), 'words' : words, 'end' : end } for words, end in transcripts] def reset(self): """ Reset the CTC decoder state at EOS (end of sentence) """ #self.timestep = 0 #self.tail_silence = 0 self.words = [] @property def language_model(self): return self.config['language_model'] ================================================ FILE: jetson_voice/models/asr/ctc_decoder.py ================================================ #!/usr/bin/env python3 # coding: utf-8 class CTCDecoder: """ CTC decoder base class for ASR. """ @staticmethod def from_config(config, vocab, resource_path=None): """ Static factory function to instantiate the correct CTC decoder instance type from the config. config['type'] == 'greedy' -> CTCGreedyDecoder config['type'] == 'beamsearch' -> CTCBeamSearchDecoder """ type = config['type'].lower() if type == 'greedy': from .ctc_greedy import CTCGreedyDecoder return CTCGreedyDecoder(config, vocab) elif type == "beamsearch": from .ctc_beamsearch import CTCBeamSearchDecoder return CTCBeamSearchDecoder(config, vocab, resource_path) else: raise ValueError(f"invalid/unrecognized CTC decoder type '{type}'") def __init__(self, config, vocab): """ See CTCDecoder.from_config() to automatically create the correct type of instance dependening on config. """ self.config = config self.vocab = vocab self.timestep = 0 self.config.setdefault('vad_eos_duration', 0.65) # max silent time until end-of-sentence self.config.setdefault('timestep_offset', 5) # number of symbols to drop for smooth streaming def decode(self, logits): """ Decode logits into words, and merge the new words with the previous words from the running transcript. Returns the running transcript as a list of word dictionaries, where each word dict has he following keys: 'text' (str) -- the text of the word 'score' (float) -- the probability of the word 'start_time' (int) -- the start time of the word (in timesteps) 'end_time' (int) -- the end time of the word (in timesteps) Note that the start/end times are transformed from timestamps into seconds by the ASR engine after CTCDecoder.decode() is called. """ pass def reset(self): """ Reset the CTC decoder state at EOS (end of sentence) """ pass def set_timestep(self, timestep): """ Set the current timestep. """ self.timestep = timestep def set_timestep_delta(self, offset): """ Set the number of timesteps per frame. """ self.timestep_delta = offset - self.config['timestep_offset'] def set_timestep_duration(self, duration): """ Set the duration of each timestep, in seconds. """ self.timestep_duration = duration self.timesteps_silence = self.config['vad_eos_duration'] / self.timestep_duration @property def type(self): """ Return the CTC decoder type string ('greedy' or 'beamsearch') """ return self.config['type'].lower() ================================================ FILE: jetson_voice/models/asr/ctc_greedy.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import string import numpy as np from .ctc_decoder import CTCDecoder from .ctc_utils import merge_words, transcript_from_words from jetson_voice.utils import global_config class CTCGreedyDecoder(CTCDecoder): """ CTC greedy decoder that simply chooses the highest-probability logits. """ def __init__(self, config, vocab): """ Create a new CTCGreedyDecoder. TODO document config. See CTCDecoder.from_config() to automatically create the correct type of instance dependening on config. 
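As a refresher on the greedy scheme this class implements - take the argmax symbol at each timestep, collapse repeats, then drop the blank symbol '_' - here is a toy decode:

timestep_argmax = ['h', 'h', '_', 'e', 'e', '_', 'l', 'l', '_', 'l', 'o', 'o', '_']

collapsed = [s for i, s in enumerate(timestep_argmax) if i == 0 or s != timestep_argmax[i-1]]
text = ''.join(s for s in collapsed if s != '_')   # -> 'hello' (the blank keeps the two l's from merging)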
""" super().__init__(config, vocab) self.config.setdefault('word_threshold', 0.1) # add blank symbol to vocabulary if '_' not in vocab: self.vocab = vocab.copy() self.vocab.append('_') self.reset() def decode(self, logits): """ Decode logits into words, and merge the new words with the previous words from the running transcript. Returns the running transcript as a list of word dictionaries, where each word dict has he following keys: 'text' (str) -- the text of the word 'score' (float) -- the probability of the word 'start_time' (int) -- the start time of the word (in timesteps) 'end_time' (int) -- the end time of the word (in timesteps) Note that the start/end times are transformed from timestamps into seconds by the ASR engine after CTCDecoder.decode() is called. """ text = [] prob = 1.0 probs = [] # select the chars with the max probability for i in range(logits.shape[0]): argmax = np.argmax(logits[i]) text.append(self.vocab[argmax]) probs.append(logits[i][argmax]) if global_config.debug: print(text) # get the max number of sequential silent timesteps (continuing from last frame) silent_timesteps = self.end_silent_timesteps max_silent_timesteps = 0 for i in range(len(text)): if text[i] == '_': silent_timesteps += 1 else: max_silent_timesteps = max(silent_timesteps, max_silent_timesteps) if i > 0 else 0 silent_timesteps = 0 if text[-1] == '_': self.end_silent_timesteps = silent_timesteps # merge repeating chars and blank symbols _, words = self.merge_chars(text, probs) #text[:len(text)-self.config['offset']] # merge new words with past words words = merge_words(self.words, words, self.config['word_threshold'], 'overlap') # increment timestep (after this frame's timestep is done being used, and before a potential EOS reset) self.timestep += self.timestep_delta # check for EOS end = False if silent_timesteps > self.timesteps_silence: end = True self.reset() else: self.words = words return [{ 'text' : transcript_from_words(words, scores=global_config.debug, times=global_config.debug, end=end, add_punctuation=self.config['add_punctuation']), 'words' : words, 'end' : end }] def merge_chars(self, text, probs): """ Merge repeating chars and blank symbols into words. """ text_merged = '' word = None words = [] def ispunct(ch): return ch in (string.punctuation + ' ') for i in range(len(text)): if text[i] != self.prev_char and text[i] != '_': self.prev_char = text[i] if text[i] != '_': text_merged += text[i] if not ispunct(text[i]): if word is None: word = { 'text' : text[i], 'score' : probs[i], 'start_char' : len(text_merged) - 1, 'end_char' : len(text_merged), 'start_time' : self.timestep + i, 'end_time' : self.timestep + i + 1 } else: word['text'] += text[i] word['score'] *= probs[i] word['end_char'] = len(text_merged) word['end_time'] = self.timestep + i + 1 if ispunct(text[i]) and word is not None: words.append(word) word = None if word is not None: words.append(word) return text_merged, words def reset(self): """ Reset the CTC decoder state at EOS (end of sentence) """ self.prev_char = '' self.end_silent_timesteps = 0 self.timestep = 0 self.words = [] ================================================ FILE: jetson_voice/models/asr/ctc_utils.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import nltk import numpy as np from jetson_voice.utils import global_config def transcript_from_words(words, scores=False, times=False, end=False, add_punctuation=True): """ Convert a list of words to the text transcript. 
""" transcript = '' for idx, word in enumerate(words): if scores and times: transcript += f"{word['text']} ({word['start_time']}:{word['end_time']} {word['score']:.2f})" elif scores: transcript += f"{word['text']} ({word['score']:.2f})" elif times: transcript += f"{word['text']} ({word['start_time']}:{word['end_time']})" else: transcript += word['text'] if idx < len(words) - 1: transcript += ' ' if end and add_punctuation: transcript += '.' # add punctuation to end return transcript def find_overlapping_word(wordlist, word): """ Find the first word from the list with overlapping times. Returns a (word, index) tuple or (None, -1) if no overlap found. """ for idx, word2 in enumerate(wordlist): if not (word['end_time'] < word2['start_time'] or word['start_time'] > word2['end_time']): return word2, idx return None, -1 def find_word_after(wordlist, time): """ Find the nearest word that starts after the time. Returns a (word, index) tuple or (None, 1) if all words start before the time. """ if isinstance(time, tuple): time = time[1] # use the end time for idx, word in enumerate(wordlist): if time <= word['start_time']: return word, idx return None, -1 def find_word_before(wordlist, time): """ Find the nearest word that starts after the time. Returns a (word, index) tuple or (None, 1) if all words start after the time. """ if isinstance(time, tuple): time = time[0] # use the start time for idx in range(len(wordlist)-1, -1, -1): if time >= wordlist[idx]['end_time']: return wordlist[idx], idx return None, -1 def merge_words(wordlist, words, score_threshold=-np.inf, method='overlap'): """ Merge new words with past words. This works by finding overlapping or similar words, and replacing the old word with new word if the new word has a higher probability. """ if len(words) == 0: return wordlist if len(wordlist) == 0: return words # short-circuit if these are all new words if words[0]['start_time'] > wordlist[-1]['end_time']: wordlist.extend(words) return wordlist if method == 'overlap': # find words that overlap and pick the highest-scoring one for word in words: if word['score'] < score_threshold: #self.config['word_threshold']: continue if len(wordlist) == 0 or word['start_time'] > wordlist[-1]['end_time']: wordlist.append(word) continue overlap_word, overlap_idx = find_overlapping_word(wordlist, word) if overlap_word is None: continue if global_config.debug: print(f"found new '{word['text']}' ({word['start_time']}:{word['end_time']} {word['score']:.2f}) overlaps with '{overlap_word['text']}' ({overlap_word['start_time']}:{overlap_word['end_time']} {overlap_word['score']:.2f})") if word['score'] > overlap_word['score']: wordlist[overlap_idx] = word elif method == 'similarity': # find the most-similar past word to the first new word similarity_metric = np.inf #1000 similarity_index = -1 for idx in range(len(wordlist)-1, -1, -1): # search in reverse so words early in the transcript aren't matched first similarity = nltk.edit_distance(words[0]['text'], wordlist[idx]['text']) if similarity < similarity_metric: similarity_metric = similarity similarity_index = idx if similarity == 0: break if global_config.debug: print(f"closest word to '{words[0]['text']}' is '{wordlist[similarity_index]['text']}' (similarity={similarity_metric}) ") wordlist = wordlist[:similarity_index] wordlist.extend(words) else: raise ValueError(f"invalid method '{method}' (valid options are 'overlap', 'similarity')") return wordlist def split_words(wordlist, times): """ Split the word list by the given times. 
note - these times should be sorted """ wordlists = [] for time in times: _, idx = find_word_after(wordlist, time) if idx < 0: wordlists.append(wordlist) return wordlists wordlists.append(wordlist[:idx]) wordlist = wordlist[idx:] wordlists.append(wordlist) return wordlists def rebase_word_times(wordlist): """ Re-base the word timings so that the start of the first word is zero. """ if len(wordlist) == 0: return wordlist #wordlist = wordlist.copy() start_offset = wordlist[0]['start_time'] for idx in range(len(wordlist)): wordlist[idx]['start_time'] -= start_offset wordlist[idx]['end_time'] -= start_offset return wordlist def find_silent_intervals(logits, blank_symbol_id, min_silent_time, time_offset): """ Find blank/silent regions in the output logits. """ num_timesteps = logits.shape[0] silent_intervals = [] last_interval_start = None for i in range(num_timesteps): argmax = np.argmax(logits[i]) if argmax == blank_symbol_id: if last_interval_start is None: last_interval_start = i if last_interval_start is not None and (argmax != blank_symbol_id or (i == num_timesteps-1)): if i - last_interval_start >= min_silent_time: silent_intervals.append((last_interval_start + time_offset, i-1+time_offset)) # print(f' new silent interval ({last_interval_start + self.timestep}:{i-1+self.timestep}) {i - last_interval_start} > {min_length:.2f}') #else: # print(f'skipping silent interval ({last_interval_start + self.timestep}:{i-1+self.timestep}) {i - last_interval_start} < {min_length:.2f}') last_interval_start = None return silent_intervals ================================================ FILE: jetson_voice/models/nlp/__init__.py ================================================ #!/usr/bin/env python3 # coding: utf-8 from .intent_slot import IntentSlotEngine from .question_answer import QuestionAnswerEngine from .text_classification import TextClassificationEngine from .token_classification import TokenClassificationEngine ================================================ FILE: jetson_voice/models/nlp/intent_slot.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import logging import numpy as np from transformers import AutoTokenizer from jetson_voice.nlp import IntentSlotService from jetson_voice.utils import load_model, normalize_logits from .nlp_utils import find_subtokens, nlp_dynamic_shapes class IntentSlotEngine(IntentSlotService): """ Joint Intent and Slot classification model in TensorRT / onnxruntime. """ def __init__(self, config, *args, **kwargs): """ Load an Intent/Slot classification model from ONNX """ super(IntentSlotEngine, self).__init__(config, *args, **kwargs) if self.config.type != 'intent_slot': raise ValueError(f"{self.config.model_path} isn't an Intent/Slot model (type '{self.config.type}'") # load model dynamic_shapes = {'max' : (1, self.config['language_model']['max_seq_length'])} # (batch_size, sequence_length) if nlp_dynamic_shapes: dynamic_shapes['min'] = (1, 1) self.model = load_model(self.config, dynamic_shapes) # create tokenizer self.tokenizer = AutoTokenizer.from_pretrained(self.config['tokenizer']['tokenizer_name']) self.null_slot = self.slot_labels[-1] # 'O' in assistant dataset - always the last label? def __call__(self, query): """ Perform intent/slot classification on the input query. Parameters: query (string) -- The text query, for example: 'What is the weather in San Francisco tomorrow?' 
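For a query like the one above, a weather-domain model might return something like the following (hypothetical intent/slot labels and scores):

    {'intent' : 'weather.weather', 'score' : 0.98,
     'slots' : [{'slot' : 'weatherplace', 'text' : 'San Francisco', 'score' : 0.92}]}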
Returns a dict with the following keys: 'intent' (string) -- the classified intent label 'score' (float) -- the intent probability [0,1] 'slots' (list[dict]) -- a list of dicts, where each dict has the following keys: 'slot' (string) -- the slot label 'text' (string) -- the slot text from the query 'score' (float) -- the slot probability [0,1] """ encodings = self.tokenizer( text=query, padding='longest' if nlp_dynamic_shapes else 'max_length', truncation=True, max_length=self.config['language_model']['max_seq_length'], return_tensors='np', return_token_type_ids=True, return_overflowing_tokens=True, return_offsets_mapping=True, return_special_tokens_mask=True, ) # during slot classification, we want to ignore slots from subtokens and special tokens subtoken_mask = find_subtokens(encodings, method='subtoken_delimiters') ignore_mask = subtoken_mask | encodings['special_tokens_mask'] # retrieve the inputs from the encoded tokens inputs = {} for input in self.model.inputs: if input.name not in encodings: raise ValueError(f"the encoded inputs from the tokenizer doesn't contain '{input.name}'") inputs[input.name] = encodings[input.name] # run the model intent_logits, slot_logits = self.model.execute(inputs) intent_logits = normalize_logits(intent_logits) slot_logits = normalize_logits(slot_logits) intent_preds = np.argmax(intent_logits, axis=-1) slot_preds = np.argmax(slot_logits, axis=-1) # convert numerical outputs to intent/slot labels results = [] for query_idx, intent_id in enumerate(intent_preds): results.append({ 'intent' : self.intent_label(intent_id), 'score' : intent_logits[query_idx][intent_id], 'slots' : [] }) for query_idx, slots in enumerate(slot_preds): query_slots = [self.slot_label(slot) for slot in slots] for token_idx, slot in enumerate(query_slots): # ignore unclassified slots or masked tokens if slot == self.null_slot or ignore_mask[query_idx][token_idx]: continue # convert from token index back to the query string chars = encodings.token_to_chars(query_idx, token_idx) text = query[chars[0]:chars[1]] # queries[query_idx] # append subtokens from the query to the text for subtoken_idx in range(token_idx+1, len(query_slots)): if subtoken_mask[query_idx][subtoken_idx]: subtoken_chars = encodings.token_to_chars(query_idx, subtoken_idx) text += query[subtoken_chars[0]:subtoken_chars[1]] else: break results[query_idx]['slots'].append({ 'slot' : slot, 'text' : text, 'score' : slot_logits[query_idx][token_idx][slots[token_idx]] }) if len(results) == 1: return results[0] else: return results @property def intent_labels(self): """ List of the intent class labels. """ return self.config['data_desc']['intent_labels'] def intent_label(self, index): """ Return an intent label by index (with bounds checking) """ return self.intent_labels[int(index)] if index < len(self.intent_labels) else 'Unknown_Intent' @property def slot_labels(self): """ List of the slot class labels. 
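For example, an assistant-style dataset might define labels like ['alarm_time', 'weatherplace', ..., 'O'] (hypothetical), with the null slot 'O' assumed to be the last entry.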
""" return self.config['data_desc']['slot_labels'] def slot_label(self, index): """ Return a slot label by index (with bounds checking) """ return self.slot_labels[int(index)] if index < len(self.slot_labels) else self.null_slot ================================================ FILE: jetson_voice/models/nlp/nlp_utils.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import numpy as np # NLP BERT models (and BERT derivatives) have myelin problem with dynamic shapes on aarch64, # so we disable dynamic shape changing for now (shapes will be set to the max sequence length) nlp_dynamic_shapes=False def find_subtokens(encodings, method='char_span'): """ Compute the subtoken mask, where each token is marked as True if it's a subtoken or False otherwise. Longer words/acronyms may be tokenized into mulitple word pieces (called subtokens), for example: 'Yosemite' -> ['yo', '##se', '##mite'] 'U.S.' -> ['u', '.', 's', '.'] Parameters: encodings (BatchEncoding) -- Output from tokenizer method (string) -- If 'char_span', the subtoken mask will be determined by looking at the character indices. Tokens that map to characters that are side-by-side are flagged as subtokens. If 'subtoken_delimiters', subtokens will be identified by looking for '##' symbols. However this can miss punctuated subtokens, such as 'U.S.' Returns boolean subtoken mask array with shape (num_queries, num_tokens) """ num_queries = encodings['input_ids'].shape[0] subtoken_mask = [] if method == 'char_span': for query_idx in range(num_queries): mask = [] last_char = -1 tokens = encodings.tokens(query_idx) for token_idx, word_id in enumerate(encodings.word_ids(query_idx)): if word_id is None: # skip special tokens mask.append(False) continue chars = encodings.token_to_chars(query_idx, token_idx) if chars[0] == last_char: mask.append(True) else: mask.append(False) last_char = chars[1] subtoken_mask.append(mask) elif method == 'subtoken_delimiters': for query_idx in range(num_queries): subtoken_mask.append([token.startswith('##') for token in encodings.tokens(query_idx)]) else: raise ValueError(f"invalid method ('{method}')") return np.asarray(subtoken_mask) ================================================ FILE: jetson_voice/models/nlp/question_answer.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import logging import numpy as np from transformers import AutoTokenizer from jetson_voice.nlp import QuestionAnswerService from jetson_voice.utils import load_model, normalize_logits from .nlp_utils import nlp_dynamic_shapes class QuestionAnswerEngine(QuestionAnswerService): """ Question answering model in TensorRT / onnxruntime. """ def __init__(self, config, *args, **kwargs): """ Load an question answering model from ONNX """ super(QuestionAnswerEngine, self).__init__(config, *args, **kwargs) if self.config.type != 'qa': raise ValueError(f"{self.config.model_path} isn't a Question Answering model (type '{self.config.type}'") # load model dynamic_shapes = {'max' : (1, self.config['dataset']['max_seq_length'])} # (batch_size, sequence_length) if nlp_dynamic_shapes: dynamic_shapes['min'] = (1, 1) self.model = load_model(self.config, dynamic_shapes) # create tokenizer self.tokenizer = AutoTokenizer.from_pretrained(self.config['tokenizer']['tokenizer_name']) self.question_first = bool(self.tokenizer.padding_side == "right") def __call__(self, query, top_k=1): """ Perform question/answering on the input query. 
Parameters: query (dict or tuple) -- Either a dict with 'question' and 'context' keys, or a (question, context) tuple. top_k (int) -- How many of the top results to return, sorted by score. The default (top_k=1) is to return just the top result. If top_k > 1, then a list of results will be returned. Returns: dict(s) with the following keys: 'answer' (string) -- the answer text 'score' (float) -- the probability [0,1] 'start' (int) -- the starting character index of the answer into the context text 'end' (int) -- the ending character index of the answer into the context text If top_k > 1, a list of dicts with the top_k results will be returned. If top_k == 1, just the single dict with the top score will be returned. """ if isinstance(query, dict): question = query['question'] context = query['context'] elif isinstance(query, tuple): question = query[0] context = query[1] else: raise ValueError(f'query must be a dict or tuple (instead was type {type(query).__name__})') # check for models that have a doc_stride >= max_seq_length # this will cause an exception in the tokenizer doc_stride = self.config['dataset']['doc_stride'] max_seq_len = self.config['dataset']['max_seq_length'] if doc_stride >= max_seq_len: doc_stride = int(max_seq_len/2) # tokenize the inputs encodings = self.tokenizer( text=question if self.question_first else context, text_pair=context if self.question_first else question, padding='longest' if nlp_dynamic_shapes else 'max_length', truncation="only_second" if self.question_first else "only_first", max_length=max_seq_len, stride=doc_stride, return_tensors='np', return_token_type_ids=True, return_overflowing_tokens=True, return_offsets_mapping=True, return_special_tokens_mask=True, ) # When the input is too long, it's converted into a batch of inputs with overflowing tokens # and a stride of overlap between the inputs. If a batch of inputs is given, a special output # "overflow_to_sample_mapping" indicates which member of the encoded batch belongs to which original batch sample. # Here we tokenize examples one-by-one so we don't need to use "overflow_to_sample_mapping". # "num_spans" is the number of output samples generated from the overflowing tokens.
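# As a rough illustration (hypothetical numbers): with max_seq_length=384 and doc_stride=128, a context that encodes to more than 384 tokens is split into multiple overlapping spans of up to 384 tokens each (with 128 tokens of overlap between consecutive spans), and each span is run through the model separately below.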
num_spans = len(encodings["input_ids"]) logging.debug(f'num_spans: {num_spans}') # p_mask: mask with 1 for tokens that cannot be in the answer (0 for tokens which can be in an answer) # We put 0 on the tokens from the context and 1 everywhere else (question and special tokens) p_mask = np.asarray( [ [tok != 1 if self.question_first else 0 for tok in encodings.sequence_ids(span_id)] for span_id in range(num_spans) ] ) # keep the cls_token unmasked (some models use it to indicate unanswerable questions) if self.tokenizer.cls_token_id is not None: cls_index = np.nonzero(encodings["input_ids"] == self.tokenizer.cls_token_id) p_mask[cls_index] = 0 # run the model over each span (TODO batching) model_outputs = [] for span_idx in range(num_spans): inputs = {} for input in self.model.inputs: if input.name not in encodings: raise ValueError(f"the encoded inputs from the tokenizer doesn't contain '{input.name}'") inputs[input.name] = np.expand_dims(encodings[input.name][span_idx], axis=0) # add batch dim model_outputs.append(self.model.execute(inputs)) # post-processing answers = [] min_null_score = 1000000 handle_impossible_answer = self.config['dataset']['version_2_with_negative'] for span_idx in range(num_spans): start_logits = np.squeeze(model_outputs[span_idx][:,:,0]) end_logits = np.squeeze(model_outputs[span_idx][:,:,1]) # Ensure padded tokens & question tokens cannot belong to the set of candidate answers. undesired_tokens = np.abs(p_mask[span_idx] - 1) & encodings['attention_mask'][span_idx] # Generate mask undesired_tokens_mask = (undesired_tokens == 0.0) # Make sure non-context indexes in the tensor cannot contribute to the softmax start_logits = np.where(undesired_tokens_mask, -10000.0, start_logits) end_logits = np.where(undesired_tokens_mask, -10000.0, end_logits) # Normalize logits and spans to retrieve the answer start_logits = np.exp(start_logits - np.log(np.sum(np.exp(start_logits), axis=-1, keepdims=True))) end_logits = np.exp(end_logits - np.log(np.sum(np.exp(end_logits), axis=-1, keepdims=True))) if handle_impossible_answer: min_null_score = min(min_null_score, (start_logits[0] * end_logits[0]).item()) # Mask CLS start_logits[0] = end_logits[0] = 0.0 # Decode token probabilities starts, ends, scores = self.decode(start_logits, end_logits, top_k=top_k) if self.tokenizer.is_fast: # Convert the answer (tokens) back to the original text # Score: score from the model # Start: Index of the first character of the answer in the context string # End: Index of the character following the last character of the answer in the context string # Answer: Plain text of the answer enc = encodings[span_idx] # Sometimes the max probability token is in the middle of a word so: # - we start by finding the right word containing the token with `token_to_word` # - then we convert this word in a character span with `word_to_chars` for s, e, score in zip(starts, ends, scores): start = enc.word_to_chars(enc.token_to_word(s), sequence_index=1 if self.question_first else 0)[0] end = enc.word_to_chars(enc.token_to_word(e), sequence_index=1 if self.question_first else 0)[1] answers.append({ 'answer' : context[start : end], 'score' : score.item(), 'start' : start, 'end' : end }) else: raise NotImplementedError('QA post-processing is only implemented for fast tokenizers') if handle_impossible_answer: answers.append({'answer': '', 'score': min_null_score, 'start': 0, 'end': 0}) answers = sorted(answers, key=lambda x: x['score'], reverse=True)[:top_k] if top_k == 1: return answers[0] else: return answers def decode(self, start: np.ndarray, end: np.ndarray, top_k: int): """ Takes the QA model output and generates the probability for each span to be the actual answer. In addition, it filters out some unwanted/impossible cases, like the answer length being greater than max_answer_len or the answer end position being before the start position. The method supports outputting the k-best answers through the top_k argument. Args: start (:obj:`np.ndarray`): Individual start probabilities for each token. end (:obj:`np.ndarray`): Individual end probabilities for each token. top_k (:obj:`int`): Indicates how many possible answer span(s) to extract from the model output. max_answer_len (:obj:`int`): Maximum size of the answer to extract from the model's output (read from the model config's 'max_answer_length' rather than passed as an argument). """ # Ensure we have batch axis if start.ndim == 1: start = start[None] if end.ndim == 1: end = end[None] # Compute the score of each tuple(start, end) to be the real answer outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1)) # Remove candidates with end < start or end - start > max_answer_len candidates = np.tril(np.triu(outer), self.config['dataset']['max_answer_length'] - 1) # Inspired by Chen & al. (https://github.com/facebookresearch/DrQA) scores_flat = candidates.flatten() if top_k == 1: idx_sort = [np.argmax(scores_flat)] elif len(scores_flat) < top_k: idx_sort = np.argsort(-scores_flat) else: idx = np.argpartition(-scores_flat, top_k)[0:top_k] idx_sort = idx[np.argsort(-scores_flat[idx])] start, end = np.unravel_index(idx_sort, candidates.shape)[1:] return start, end, candidates[0, start, end] ================================================ FILE: jetson_voice/models/nlp/text_classification.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import logging import numpy as np from transformers import AutoTokenizer from jetson_voice.nlp import TextClassificationService from jetson_voice.utils import load_model, normalize_logits from .nlp_utils import nlp_dynamic_shapes class TextClassificationEngine(TextClassificationService): """ Text classification model in TensorRT / onnxruntime. """ def __init__(self, config, *args, **kwargs): """ Load a text classification model from ONNX """ super(TextClassificationEngine, self).__init__(config, *args, **kwargs) if self.config.type != 'text_classification': raise ValueError(f"{self.config.model_path} isn't a Text Classification model (type '{self.config.type}')") # load model dynamic_shapes = {'max' : (1, self.config['dataset']['max_seq_length'])} # (batch_size, sequence_length) if nlp_dynamic_shapes: dynamic_shapes['min'] = (1, 1) self.model = load_model(self.config, dynamic_shapes) # create tokenizer self.tokenizer = AutoTokenizer.from_pretrained(self.config['tokenizer']['tokenizer_name']) def __call__(self, query): """ Perform text classification on the input query. Parameters: query (string) -- The text query, for example: 'Today was warm, sunny and beautiful out.'
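For the example query above, a two-class sentiment model might return something like (hypothetical class and score):

    {'class' : 1, 'label' : '1', 'score' : 0.99}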
Returns a dict with the following keys: 'class' (int) -- the predicted class index 'label' (string) -- the predicted class label (and if there aren't labels `str(class)`) 'score' (float) -- the classification probability [0,1] """ encodings = self.tokenizer( text=query, padding='longest' if nlp_dynamic_shapes else 'max_length', truncation=True, max_length=self.config['dataset']['max_seq_length'], return_tensors='np', return_token_type_ids=True, return_overflowing_tokens=True, return_offsets_mapping=True, return_special_tokens_mask=True, ) # retrieve the inputs from the encoded tokens inputs = {} for input in self.model.inputs: if input.name not in encodings: raise ValueError(f"the encoded inputs from the tokenizer doesn't contain '{input.name}'") inputs[input.name] = encodings[input.name] # run the model logits = self.model.execute(inputs) logits = normalize_logits(logits) preds = np.argmax(logits, axis=-1) # tabulate results results = [] for query_idx in range(preds.shape[0]): results.append({ 'class' : int(preds[query_idx]), 'label' : str(preds[query_idx]), 'score' : logits[query_idx][preds[query_idx]] }) if len(results) == 1: return results[0] else: return results ================================================ FILE: jetson_voice/models/nlp/token_classification.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import logging import numpy as np from transformers import AutoTokenizer from jetson_voice.nlp import TokenClassificationService from jetson_voice.utils import load_model, normalize_logits from .nlp_utils import find_subtokens, nlp_dynamic_shapes class TokenClassificationEngine(TokenClassificationService): """ Token classification model (aka Named Entity Recognition) in TensorRT / onnxruntime. """ def __init__(self, config, *args, **kwargs): """ Load an token classification model for NER from ONNX """ super(TokenClassificationEngine, self).__init__(config, *args, **kwargs) if self.config.type != 'token_classification': raise ValueError(f"{self.config.model_path} isn't a Token Classification model (type '{self.config.type}'") # load model dynamic_shapes = {'max' : (1, self.config['dataset']['max_seq_length'])} # (batch_size, sequence_length) if nlp_dynamic_shapes: dynamic_shapes['min'] = (1, 1) self.model = load_model(self.config, dynamic_shapes) # create tokenizer self.tokenizer = AutoTokenizer.from_pretrained(self.config['tokenizer']['tokenizer_name']) def __call__(self, query): """ Perform token classification (NER) on the input query and return tagged entities. 
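For example, in the query below a NER model might tag 'Ben' as a person (B-PER) and 'Chicago' as a location (B-LOC) (hypothetical labels).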
Parameters: query (string) -- The text query, for example: "Ben is from Chicago, a city in the state of Illinois, US' Returns a list[dict] of tagged entities with the following dictionary keys: 'class' (int) -- the entity class index 'label' (string) -- the entity class label 'score' (float) -- the classification probability [0,1] 'text' (string) -- the corresponding text from the input query 'start' (int) -- the starting character index of the text 'end' (int) -- the ending character index of the text """ encodings = self.tokenizer( text=query, padding='longest' if nlp_dynamic_shapes else 'max_length', truncation=True, max_length=self.config['dataset']['max_seq_length'], return_tensors='np', return_token_type_ids=True, return_overflowing_tokens=True, return_offsets_mapping=True, return_special_tokens_mask=True, ) # during token classification, we want to ignore slots from subtokens and special tokens subtoken_mask = find_subtokens(encodings) ignore_mask = subtoken_mask | encodings['special_tokens_mask'] # retrieve the inputs from the encoded tokens inputs = {} for input in self.model.inputs: if input.name not in encodings: raise ValueError(f"the encoded inputs from the tokenizer doesn't contain '{input.name}'") inputs[input.name] = encodings[input.name] # run the model logits = self.model.execute(inputs) logits = normalize_logits(logits) preds = np.argmax(logits, axis=-1) probs = np.amax(logits, axis=-1) # tabulate results tags = [] label_map = {v: k for k, v in self.config['label_ids'].items()} num_queries, num_tokens, _ = logits.shape assert num_queries == 1 # there should only be 1 input query currently for query_idx in range(num_queries): query_tags = [] for token_idx in range(num_tokens): label = label_map[preds[query_idx][token_idx]] # ignore unclassified slots or masked tokens if label == self.config['dataset']['pad_label'] or ignore_mask[query_idx][token_idx]: continue # convert from token index back to the query string chars = encodings.token_to_chars(query_idx, token_idx) # append subtokens from the query to the text for subtoken_idx in range(token_idx+1, num_tokens): if subtoken_mask[query_idx][subtoken_idx]: chars = (chars[0], encodings.token_to_chars(query_idx, subtoken_idx)[1]) else: break text = query[chars[0]:chars[1]] # queries[query_idx] # strip out punctuation to attach the entity tag to the word not to a punctuation mark if not text[-1].isalpha(): text = text[:-1] chars = (chars[0], chars[1]-1) query_tags.append({ 'label' : label, 'class' : preds[query_idx][token_idx], 'score' : probs[query_idx][token_idx], 'text' : text, 'start' : chars[0], 'end' : chars[1] }) tags.append(query_tags) if len(tags) == 1: return tags[0] else: return tags ================================================ FILE: jetson_voice/models/tts/__init__.py ================================================ #!/usr/bin/env python3 # coding: utf-8 from .tts_engine import TTSEngine ================================================ FILE: jetson_voice/models/tts/tts_engine.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import re import logging import inflect import numpy as np from jetson_voice.tts import TTSService from jetson_voice.utils import global_config, load_model, softmax class TTSEngine(TTSService): """ Text-to-speech synthesis. 
This is actually a pipeline of two models, the generator model (which generates MEL spectrograms from tokens), and the vocoder (which outputs audio from MEL spectrograms) """ def __init__(self, config, *args, **kwargs): """ Loads a text-to-speech model from ONNX or serialized TensorRT engine. Parameters: model (string) -- path to ONNX model or serialized TensorRT engine/plan config (string) -- path to model configuration json (will be inferred from model path if empty) """ super(TTSEngine, self).__init__(config, *args, **kwargs) if self.config.type != 'tts': raise ValueError(f"{self.config.model_path} isn't a Text-to-Speech model (type '{self.config.type}')") # load text->MEL generator model self.generator = load_model(self.config.generator) # load MEL->audio vocoder model features = self.config.vocoder.features dynamic_shapes = { 'min' : (1, features, 1), 'opt' : (1, features, 160), # ~5-6 words 'max' : (1, features, 1024) # ~20-30 words? } self.vocoder = load_model(self.config.vocoder, dynamic_shapes=dynamic_shapes) # create map of symbol->ID embeddings self.symbol_to_id = {s: i for i, s in enumerate(self.get_symbols())} # create operators for num-to-word conversion self.number_regex = re.compile(r'\d+(?:,\d+)?') # https://stackoverflow.com/a/16321189 self.number_inflect = inflect.engine() def __call__(self, text): """ Generate audio from text. Parameters: text (string) -- The phrase to convert to audio. Returns audio samples in a numpy array. """ text = self.numbers_to_words(text) # vocab doesn't include numbers, so convert them to words pad_symbol = ' ' min_length = 6 if text[-1].isalnum(): # ensure the text ends with punctuation/whitespace, otherwise the audio gets cut off text += pad_symbol if len(text) < min_length: # WAR for cuDNN error on JetPack <= 4.5.x text = text.ljust(min_length, pad_symbol) # convert chars to symbol embeddings encoded_text = [self.symbol_to_id[s] for s in text.lower() if s in self.symbol_to_id] encoded_text = np.expand_dims(np.array(encoded_text, dtype=np.int64), axis=0) # generate MEL spectrogram + audio mels = self.generator.execute(encoded_text)[0] audio = self.vocoder.execute(mels) return audio.squeeze() def get_symbols(self): """ Return a list of all the accepted character symbols / embeddings """ _arpabet = [ 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' ] _arpabet = ['@' + s for s in _arpabet] _pad = '_' _punctuation = '!\'(),.:;? ' _special = '-' _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' symbols = list(_pad + _special + _punctuation + _letters) + _arpabet return symbols def numbers_to_words(self, text): """ Convert instances of numbers to words in the text. For example: "The answer is 42" -> "The answer is forty-two."
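Note that the regex above only matches integers (optionally with a single comma group, e.g. '1,024'); floating-point numbers aren't handled yet (see the TODO below).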
""" number_tokens = self.number_regex.findall(text) for number_token in number_tokens: # TODO test/handle floating-point numbers word_text = self.number_inflect.number_to_words(number_token) num_begin = text.index(number_token) # insert the words back at the old location text = text[:num_begin] + word_text + text[num_begin + len(number_token):] return text @property def sample_rate(self): """ Get the output sample rate (e.g. 22050, 44100, ect) """ return self.config['vocoder']['sample_rate'] ================================================ FILE: jetson_voice/nlp.py ================================================ #!/usr/bin/env python3 # coding: utf-8 from jetson_voice.utils import load_resource def NLP(resource, *args, **kwargs): """ Factory for automatically loading NLP models or services. Returns an instance of: - IntentSlotService - QuestionAnswerService - TextClassificationService - TokenClassificationService """ from jetson_voice.auto import AutoModel return AutoModel(resource, domain='nlp', *args, **kwargs) def IntentSlot(resource, *args, **kwargs): """ Loads a NLP joint intent/slot classifier service or model. See the IntentSlotService class for the signature that implementations use. """ factory_map = { 'tensorrt' : 'jetson_voice.models.nlp.IntentSlotEngine', 'onnxruntime' : 'jetson_voice.models.nlp.IntentSlotEngine' } return load_resource(resource, factory_map, *args, **kwargs) class IntentSlotService(): """ Intent/slot classifier service base class. """ def __init__(self, config, *args, **kwargs): """ Create service instance. """ self.config = config def __call__(self, query): """ Perform intent/slot classification on the input query. Parameters: query (string) -- The text query, for example: 'What is the weather in San Francisco tomorrow?' Returns a dict with the following keys: 'intent' (string) -- the classified intent label 'score' (float) -- the intent probability [0,1] 'slots' (list[dict]) -- a list of dicts, where each dict has the following keys: 'slot' (string) -- the slot label 'text' (string) -- the slot text from the query 'score' (float) -- the slot probability [0,1] """ pass def QuestionAnswer(resource, *args, **kwargs): """ Loads a NLP question answering service or model. See the QuestionAnswerService class for the signature that implementations use. """ factory_map = { 'tensorrt' : 'jetson_voice.models.nlp.QuestionAnswerEngine', 'onnxruntime' : 'jetson_voice.models.nlp.QuestionAnswerEngine' } return load_resource(resource, factory_map, *args, **kwargs) class QuestionAnswerService(): """ Question answering service base class. """ def __init__(self, config, *args, **kwargs): """ Create service instance. """ self.config = config def __call__(self, query, top_k=1): """ Perform question/answering on the input query. Parameters: query (dict or tuple) -- Either a dict with 'question' and 'context' keys, or a (question, context) tuple. top_k (int) -- How many of the top results to return, sorted by score. The default (topk=1) is to return just the top result. If topk > 1, then a list of results will be returned. Returns: dict(s) with the following keys: 'answer' (string) -- the answer text 'score' (float) -- the probability [0,1] 'start' (int) -- the starting character index of the answer into the context text 'end' (int) -- the ending character index of the answer into the context text If top_k > 1, a list of dicts with the topk results will be returned. If top_k == 1, just the single dict with the top score will be returned. 
""" pass def TextClassification(resource, *args, **kwargs): """ Loads a NLP text classification service or model. See the TextClassificationService class for the signature that implementations use. """ factory_map = { 'tensorrt' : 'jetson_voice.models.nlp.TextClassificationEngine', 'onnxruntime' : 'jetson_voice.models.nlp.TextClassificationEngine' } return load_resource(resource, factory_map, *args, **kwargs) class TextClassificationService(): """ Text classification service base class. """ def __init__(self, config, *args, **kwargs): """ Create service instance. """ self.config = config def __call__(self, query): """ Perform text classification on the input query. Parameters: query (string) -- The text query, for example: 'Today was warm, sunny and beautiful out.' Returns a dict with the following keys: 'class' (int) -- the predicted class index 'label' (string) -- the predicted class label (and if there aren't labels `str(class)`) 'score' (float) -- the classification probability [0,1] """ pass def TokenClassification(resource, *args, **kwargs): """ Loads a NLP token classification (aka Named Entity Recognition) service or model. See the TokenClassificationService class for the signature that implementations use. """ factory_map = { 'tensorrt' : 'jetson_voice.models.nlp.TokenClassificationEngine', 'onnxruntime' : 'jetson_voice.models.nlp.TokenClassificationEngine' } return load_resource(resource, factory_map, *args, **kwargs) class TokenClassificationService(): """ Token classification (aka Named Entity Recognition) service base class. """ def __init__(self, config, *args, **kwargs): """ Create service instance. """ self.config = config def __call__(self, query): """ Perform token classification (NER) on the input query and return tagged entities. Parameters: query (string) -- The text query, for example: "Ben is from Chicago, a city in the state of Illinois, US' Returns a list[dict] of tagged entities with the following dictionary keys: 'class' (int) -- the entity class index 'label' (string) -- the entity class label 'score' (float) -- the classification probability [0,1] 'text' (string) -- the corresponding text from the input query 'start' (int) -- the starting character index of the text 'end' (int) -- the ending character index of the text """ pass @staticmethod def tag_string(query, tags, scores=False): """ Returns a string with the tags inserted inline with the query. For example: "Ben[B-PER] is from Chicago[B-LOC], a city in the state of Illinois[B-LOC], US[B-LOC]" Parameters: query (string) -- The original query string. tags (list[dict]) -- The tags predicted by the model. scores (bool) -- If true, the probabilities will be added inline. If false (default), only the tag labels will be added. 
""" char_offset = 0 for tag in tags: if scores: tag_str = f"[{tag['label']} {tag['score']:.3}]" else: tag_str = f"[{tag['label']}]" query = query[:tag['end'] + char_offset] + tag_str + query[tag['end'] + char_offset:] char_offset += len(tag_str) return query if __name__ == "__main__": from jetson_voice import ConfigArgParser import pprint parser = ConfigArgParser() parser.add_argument('--model', default='distilbert_intent', type=str) parser.add_argument('--type', default='intent_slot', type=str) args = parser.parse_args() args.type = args.type.lower() print(args) if args.type == 'intent_slot': model = IntentSlot(args.model) # create some test queries queries = [ 'Set alarm for Seven Thirty AM', 'Please increase the volume', 'What is my schedule for tomorrow', 'Place an order for a large pepperoni pizza from Dominos' ] # process the queries for query in queries: results = model(query) print('\n') print('query:', query) print('') pprint.pprint(results) elif args.type == 'question_answer' or args.type == 'qa': model = QuestionAnswer(args.model) # create some test queries queries = [] queries.append({ "question" : "What is the value of Pi?", "context" : "Some people have said that Pi is tasty but there should be a value for Pi, and the value for Pi is around 3.14. " "Pi is the ratio of a circle's circumference to it's diameter. The constant Pi was first calculated by Archimedes " "in ancient Greece around the year 250 BC." }) queries.append({ "question" : "Who discovered Pi?", "context" : queries[-1]['context'] }) queries.append({ "question" : "Which nation contains the majority of the Amazon forest?", "context" : "The Amazon rainforest is a moist broadleaf forest that covers most of the Amazon basin of South America. " "This basin encompasses 7,000,000 square kilometres (2,700,000 sq mi), of which 5,500,000 square kilometres " "(2,100,000 sq mi) are covered by the rainforest. The majority of the forest is contained within Brazil, " "with 60% of the rainforest, followed by Peru with 13%, and Colombia with 10%." }) queries.append({ "question" : "How large is the Amazon rainforest?", "context" : queries[-1]['context'] }) # process the queries for query in queries: answers = model(query, top_k=5) print('\n') print('context:', query['context']) print('') print('question:', query['question']) for answer in answers: print('') print('answer: ', answer['answer']) print('score: ', answer['score']) elif args.type == 'text_classification': model = TextClassification(args.model) # create some test queries (these are for sentiment models) queries = [ "By the end of no such thing the audience, like beatrice, has a watchful affection for the monster.", "Director Rob Marshall went out gunning to make a great one.", "Uneasy mishmash of styles and genres.", "I love exotic science fiction / fantasy movies but this one was very unpleasant to watch. I gave it 4 / 10 since some special effects were nice.", "Today was cold and rainy and not very nice.", "Today was warm, sunny and beautiful out.", ] # process the queries for query in queries: results = model(query) print('\nquery:', query) pprint.pprint(results) elif args.type == 'token_classification': model = TokenClassification(args.model) # create some test queries queries = [ "But candidate Charles Baker, who has about eight percent of the vote, has called for an investigation into reports of people voting multiple times.", "Analysts say Mr. 
Chung's comments may be part of efforts by South Korea to encourage North Korea to resume bilateral talks.", "The 63-year-old Daltrey walked offstage during the first song; guitarist Pete Townshend later told the crowd he was suffering from bronchitis and could barely speak.", "The Who is currently touring in support of Endless Wire, its first album since 1982.", "Meanwhile, Iowa is cleaning up after widespread flooding inundated homes, destroyed crops and cut off highways and bridges.", "At the White House Tuesday, U.S. President George Bush expressed concern for the flood victims.", "Ben is from Chicago, a city in the state of Illinois, US with a population of 2.7 million people.", "Lisa's favorite place to climb in the summer is El Capitan in Yosemite National Park in California, U.S." ] # process the queries for query in queries: tags = model(query) #print(f'\n{query}') #pprint.pprint(tags) print(f'\n{model.tag_string(query, tags, scores=True)}') else: raise ValueError(f"invalid --type argument ({args.type})") ================================================ FILE: jetson_voice/tts.py ================================================ #!/usr/bin/env python3 # coding: utf-8 from jetson_voice.utils import load_resource def TTS(resource, *args, **kwargs): """ Loads a TTS service or model. See the TTSService class for the signature that implementations use. """ factory_map = { 'riva' : 'jetson_voice.backends.riva.RivaTTSService', 'tensorrt' : 'jetson_voice.models.tts.TTSEngine', 'onnxruntime' : 'jetson_voice.models.tts.TTSEngine' } return load_resource(resource, factory_map, *args, **kwargs) class TTSService(): """ TTS service base class. """ def __init__(self, config, *args, **kwargs): """ Create service instance. """ self.config = config def __call__(self, text): """ Generate audio from text. Parameters: text (string) -- The phrase to convert to audio. Returns audio samples in a numpy array. """ pass @property def sample_rate(self): """ Get the output sample rate (in Hz) """ pass if __name__ == "__main__": from jetson_voice import list_audio_devices, ConfigArgParser from soundfile import SoundFile import pprint import pyaudio import time parser = ConfigArgParser() parser.add_argument('--model', default='fastpitch_hifigan', type=str) parser.add_argument('--text', default='Hello, how are you today?', type=str) parser.add_argument('--warmup', type=int, default=9, help='the number of warmup runs') parser.add_argument("--output-device", type=int, default=None, help='output audio device to use') parser.add_argument("--output-wav", type=str, default=None, help='output wav file to write to') parser.add_argument('--list-devices', action='store_true', help='list audio input devices') args = parser.parse_args() print(args) # list audio devices if args.list_devices: list_audio_devices() # load the model tts = TTS(args.model) # display the text print(f"\n'{args.text}'\n") # run the TTS for run in range(args.warmup+1): start = time.perf_counter() audio = tts(args.text) stop = time.perf_counter() latency = stop-start duration = audio.shape[0]/tts.sample_rate print(f"Run {run} -- Time to first audio: {latency:.3f}s. Generated {duration:.2f}s of audio. 
RTFx={duration/latency:.2f}.") # output the audio if args.output_device is not None: p = pyaudio.PyAudio() stream = p.open(output_device_index=args.output_device, format=pyaudio.paFloat32, channels=1, rate=tts.sample_rate, output=True) stream.write(audio.tobytes()) stream.stop_stream() stream.close() if args.output_wav is not None: wav = SoundFile(args.output_wav, mode='w', samplerate=tts.sample_rate, channels=1) wav.write(audio) wav.close() print(f"Wrote audio to {args.output_wav}") ================================================ FILE: jetson_voice/utils/__init__.py ================================================ #!/usr/bin/env python3 # coding: utf-8 from .config import global_config, ConfigDict, ConfigArgParser from .resource import find_resource, load_resource, load_model, list_models from .audio import * from .softmax import softmax, normalize_logits ================================================ FILE: jetson_voice/utils/audio.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import math import pprint import logging import librosa import soundfile import pyaudio as pa import numpy as np def audio_db(samples): """ Compute RMS of audio samples in dB. """ rms = librosa.feature.rms(y=samples, frame_length=samples.shape[0], center=False)[0][0] if rms != 0.0: return 20.0 * math.log10(rms) else: return -100.0 def audio_to_float(samples): """ Convert audio samples to 32-bit float in the range [-1,1] """ if samples.dtype == np.float32: return samples return samples.astype(np.float32) / 32768 def audio_to_int16(samples): """ Convert audio samples to 16-bit int in the range [-32768,32767] """ if samples.dtype == np.int16: return samples elif samples.dtype == np.float32: return (samples * 32768).astype(np.int16) else: return samples.astype(np.int16) def AudioInput(wav=None, mic=None, sample_rate=16000, chunk_size=16000): """ Create an audio input stream from wav file or microphone. Either the wav or mic argument needs to be specified.
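For example (hypothetical device index; see list_audio_devices() for the indices on your system):

    stream = AudioInput(mic=11, sample_rate=16000)
    for samples in stream:
        process(samples)    # 'process' is a placeholder for your own handler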
Parameters: wav (string) -- path to .wav file mic (int) -- microphone device index sample_rate (int) -- the desired sample rate in Hz chunk_size (int) -- the number of samples returned per next() iteration Returns AudioWavStream or AudioMicStream """ if mic is not None and mic != '': return AudioMicStream(mic, sample_rate=sample_rate, chunk_size=chunk_size) elif wav is not None and wav != '': return AudioWavStream(wav, sample_rate=sample_rate, chunk_size=chunk_size) else: raise ValueError('either wav or mic argument must be specified') class AudioWavStream: """ Audio playback stream from .wav file """ def __init__(self, filename, sample_rate, chunk_size): self.filename = filename self.chunk_size = chunk_size self.sample_rate = sample_rate if not os.path.isfile(filename): raise IOError(f'could not find file {filename}') logging.info(f"loading audio '{filename}'") self.samples, _ = librosa.load(filename, sr=sample_rate, mono=True) self.position = 0 def open(self): pass def close(self): pass def reset(self): self.position = 0 def next(self): if self.position >= len(self.samples): return None chunk = self.samples[self.position : min(self.position + self.chunk_size, len(self.samples))] if len(chunk) < self.chunk_size: chunk = np.pad(chunk, (0, self.chunk_size-len(chunk)), mode='constant') self.position += self.chunk_size return chunk def __next__(self): samples = self.next() if samples is None: raise StopIteration else: return samples def __iter__(self): self.position = 0 return self class AudioMicStream: """ Live audio stream from microphone input device. """ def __init__(self, device, sample_rate, chunk_size): self.stream = None self.interface = pa.PyAudio() self.device_info = find_audio_device(device, self.interface) self.device_id = self.device_info['index'] self.device_sample_rate = sample_rate self.device_chunk_size = chunk_size self.sample_rate = sample_rate self.chunk_size = chunk_size print('Audio Input Device:') pprint.pprint(self.device_info) def __del__(self): self.close() self.interface.terminate() def open(self): if self.stream: return sample_rates = [self.sample_rate, int(self.device_info['defaultSampleRate']), 16000, 22050, 32000, 44100] chunk_sizes = [] for sample_rate in sample_rates: chunk_sizes.append(int(self.chunk_size * sample_rate / self.sample_rate)) for sample_rate, chunk_size in zip(sample_rates, chunk_sizes): try: logging.info(f'trying to open audio input {self.device_id} with sample_rate={sample_rate} chunk_size={chunk_size}') self.stream = self.interface.open(format=pa.paInt16, channels=1, rate=sample_rate, input=True, input_device_index=self.device_id, frames_per_buffer=chunk_size) self.device_sample_rate = sample_rate self.device_chunk_size = chunk_size break except OSError as err: print(err) logging.warning(f'failed to open audio input {self.device_id} with sample_rate={sample_rate}') self.stream = None if self.stream is None: logging.error(f'failed to open audio input device {self.device_id} with any of these sample rates:') logging.error(str(sample_rates)) raise ValueError(f"audio input device {self.device_id} couldn't be opened or does not support any of the above sample rates") print(f"\naudio stream opened on device {self.device_id} ({self.device_info['name']})") print("you can begin speaking now... 
(press Ctrl+C to exit)\n") def close(self): if self.stream is not None: self.stream.stop_stream() self.stream.close() self.stream = None def reset(self): self.close() self.open() def next(self): self.open() samples = self.stream.read(self.device_chunk_size, exception_on_overflow=False) samples = np.frombuffer(samples, dtype=np.int16) if self.sample_rate != self.device_sample_rate: samples = audio_to_float(samples) samples = librosa.resample(samples, self.device_sample_rate, self.sample_rate) if len(samples) != self.chunk_size: logging.warning(f'resampled input audio has {len(samples)} samples, but expected {self.chunk_size}') return samples def __next__(self): samples = self.next() if samples is None: raise StopIteration else: return samples def __iter__(self): self.open() return self class AudioOutput: """ Audio output stream to a speaker. """ def __init__(self, device, sample_rate, chunk_size=4096): self.stream = None if device is None: self.device_id = None logging.warning(f"creating pass-through audio output without a device") return self.interface = pa.PyAudio() self.device_info = find_audio_device(device, self.interface) self.device_id = self.device_info['index'] self.chunk_size = chunk_size self.sample_rate = sample_rate self.requested_rate = sample_rate print('Audio Output Device:') pprint.pprint(self.device_info) self.open() def __del__(self): if self.device_id is None: return self.close() self.interface.terminate() def open(self): if self.stream or self.device_id is None: return try: self.stream = self.interface.open(format=pa.paFloat32, channels=1, rate=self.sample_rate, frames_per_buffer=self.chunk_size, output=True, output_device_index=self.device_id) except: self.sample_rate = int(self.device_info['defaultSampleRate']) logging.error(f"failed to open audio output device with sample_rate={self.requested_rate}, trying again with sample_rate={self.sample_rate}") self.stream = self.interface.open(format=pa.paFloat32, channels=1, rate=self.sample_rate, frames_per_buffer=self.chunk_size, output=True, output_device_index=self.device_id) logging.info(f"opened audio output device {self.device_id} ({self.device_info['name']})") def close(self): if self.stream is not None: self.stream.stop_stream() self.stream.close() self.stream = None def write(self, samples): if self.device_id is None: return self.open() samples = audio_to_float(samples) if self.requested_rate != self.sample_rate: samples = librosa.resample(samples, self.requested_rate, self.sample_rate) #wav = soundfile.SoundFile('data/audio/resample_test.wav', mode='w', samplerate=self.sample_rate, channels=1) #wav.write(samples) #wav.close() self.stream.write(samples.tobytes()) # # device enumeration # _audio_device_info = None def _get_audio_devices(audio_interface=None): global _audio_device_info if _audio_device_info: return _audio_device_info if audio_interface: interface = audio_interface else: interface = pa.PyAudio() info = interface.get_host_api_info_by_index(0) numDevices = info.get('deviceCount') _audio_device_info = [] for i in range(0, numDevices): _audio_device_info.append(interface.get_device_info_by_host_api_device_index(0, i)) if not audio_interface: interface.terminate() return _audio_device_info def find_audio_device(device, audio_interface=None): """ Find an audio device by its name or ID number.
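For example (hypothetical device), find_audio_device(11) and find_audio_device('USB Microphone') both return the PyAudio device info dict for the matching device.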
""" devices = _get_audio_devices(audio_interface) try: device_id = int(device) except ValueError: if not isinstance(device, str): raise ValueError("expected either a string or an int for 'device' parameter") found = False for id, dev in enumerate(devices): if device.lower() == dev['name'].lower(): device_id = id found = True break if not found: raise ValueError(f"could not find audio device with name '{device}'") if device_id < 0 or device_id >= len(devices): raise ValueError(f"invalid audio device ID ({device_id})") return devices[device_id] def list_audio_inputs(): """ Print out information about present audio input devices. """ devices = _get_audio_devices() print('') print('----------------------------------------------------') print(f" Audio Input Devices") print('----------------------------------------------------') for i, dev_info in enumerate(devices): if (dev_info.get('maxInputChannels')) > 0: print("Input Device ID {:d} - '{:s}' (inputs={:.0f}) (sample_rate={:.0f})".format(i, dev_info.get('name'), dev_info.get('maxInputChannels'), dev_info.get('defaultSampleRate'))) print('') def list_audio_outputs(): """ Print out information about present audio output devices. """ devices = _get_audio_devices() print('') print('----------------------------------------------------') print(f" Audio Output Devices") print('----------------------------------------------------') for i, dev_info in enumerate(devices): if (dev_info.get('maxOutputChannels')) > 0: print("Output Device ID {:d} - '{:s}' (outputs={:.0f}) (sample_rate={:.0f})".format(i, dev_info.get('name'), dev_info.get('maxOutputChannels'), dev_info.get('defaultSampleRate'))) print('') def list_audio_devices(): """ Print out information about present audio input and output devices. """ list_audio_inputs() list_audio_outputs() ================================================ FILE: jetson_voice/utils/config.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import json import pprint import logging import argparse # # Default global configuration # # This can be overriden at runtime with command-line options (see ConfigArgParser) # such as --global-config to load your own configuration from json file, # or by calling config.load('my_config.json') # # You can also set the options directly on the 'config' object, e.g. # # config.model_dir = '/path/to/my/models' # config.log_level = 'warning' # # It's recommended to use one of the methods above instead of changing _default_config directly. # _default_global_config = { 'version' : 0.1, 'model_dir' : '/jetson-voice/data/networks', 'model_manifest' : '/jetson-voice/data/networks/manifest.json', 'default_backend' : 'tensorrt', 'log_level' : 'info', 'debug' : False, 'profile' : False } class ConfigDict(dict): """ Configuration dict that can be loaded from JSON and has members accessible via attributes and can watch for updates to keys. """ def __init__(self, *args, path=None, watch=None, **kwargs): """ Parameters: path (str) -- Path to JSON file to load from watch (function or dict) -- A callback function that gets called when a key is set. Should a function signature like my_watch(key, value) This can also be a dict of key names and functions, and each function will only be called when it's particular key has been set. You can also subclass ConfigDict and override the __watch__() member function. 
""" super(ConfigDict, self).__init__(*args, **kwargs) self.__dict__['path'] = path self.__dict__['watch'] = watch for x in args: if isinstance(x, dict): for y in x: self.__watch__(y, x[y]) for x in kwargs: self.__watch__(x, kwargs[x]) if path: self.load(path) def load(self, path, clear=False): """ Load from JSON file. """ from .resource import find_resource # import here to avoid circular dependency path = find_resource(path) self.__dict__['path'] = path if clear: self.clear() with open(path) as file: config_dict = json.load(file) self.update(config_dict) def __getattr__(self, attr): if attr in self.__dict__: return self.__dict__[attr] else: return self[attr] def __setattr__(self, attr, value): if attr in self.__dict__: self.__dict__[attr] = value else: self[attr] = value def __setitem__(self, key, value): if isinstance(value, dict): value = ConfigDict(value, watch=self.watch) value.__dict__['path'] = self.path super(ConfigDict, self).__setitem__(key, value) self.__watch__(key, value) def __watch__(self, key, value): #print(f'watch {key} -> {value}') if not self.watch: return if isinstance(self.watch, dict): if key in self.watch: self.watch[key](key, value) else: self.watch(key, value) def __str__(self): return pprint.pformat(self) #def __repr__(self): # return pprint.saferepr(self) def setdefault(self, key, default=None): if isinstance(default, dict): value = ConfigDict(value, watch=self.watch) value.__dict__['path'] = self.path changed = key not in self value = super(ConfigDict, self).setdefault(key, default) if changed: self.__watch__(key, value) def update(self, *args, **kwargs): for k, v in dict(*args, **kwargs).items(): self[k] = v # # logging handlers # logging.basicConfig(format='[%(asctime)s] %(filename)s:%(lineno)d - %(message)s', datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO) global_config = None def _set_log_level(key, value): log_value = value.upper() if log_value == 'VERBOSE': log_value = 'DEBUG' log_level = getattr(logging, log_value, None) if not isinstance(log_level, int): raise ValueError(f'Invalid log level: {value}') logging.getLogger().setLevel(log_level) logging.debug(f'set logging level to {value}') if global_config is not None and value.upper() == 'DEBUG': global_config['debug'] = True # # global config definition # global_config = ConfigDict(_default_global_config, watch={'log_level':_set_log_level}) if global_config.log_level.upper() == 'DEBUG': global_config['debug'] = True logging.debug(f'global config:\n{global_config}') # # custom arg parser # class ConfigArgParser(argparse.ArgumentParser): """ ArgumentParser that provides global configuration options. 
""" def __init__(self, *args, **kwargs): super(ConfigArgParser, self).__init__(*args, **kwargs) self.add_argument('--global-config', default=None, type=str, help='path to JSON file to load global configuration from') self.add_argument('--model-dir', default=_default_global_config['model_dir'], help=f"sets the root path of the models (default '{_default_global_config['model_dir']}')") self.add_argument('--model-manifest', default=_default_global_config['model_manifest'], help=f"sets the path to the model manifest file (default '{_default_global_config['model_manifest']}')") self.add_argument('--list-models', action='store_true', help='lists the available models (from $model_dir/manifest.json)') self.add_argument('--default-backend', default=_default_global_config['default_backend'], help=f"sets the default backend to use for model execution (default '{_default_global_config['default_backend']}')") self.add_argument('--profile', action='store_true', help='enables model performance profiling') self.add_argument('--verbose', action='store_true', help='sets the logging level to verbose') self.add_argument('--debug', action='store_true', help='sets the logging level to debug') log_levels = ['debug', 'verbose', 'info', 'warning', 'error', 'critical'] self.add_argument('--log-level', default=_default_global_config['log_level'], type=str, choices=log_levels, help=f"sets the logging level to one of the options above (default={_default_global_config['log_level']})") def parse_args(self, *args, **kwargs): args = super(ConfigArgParser, self).parse_args(*args, **kwargs) global_config.log_level = args.log_level global_config.model_dir = args.model_dir global_config.model_manifest = args.model_manifest global_config.default_backend = args.default_backend if args.profile: global_config.profile = True if args.verbose: global_config.log_level = 'verbose' if args.debug: global_config.log_level = 'debug' if args.global_config: global_config.load(args.global_config) if args.list_models: from .resource import list_models list_models() logging.debug(f'global config:\n{global_config}') return args ================================================ FILE: jetson_voice/utils/resource.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import json import time import tqdm import pprint import logging import tarfile import urllib import importlib from .config import global_config, ConfigDict def find_resource(path): """ Find a resource by checking some common paths. """ if os.path.exists(path): return path search_dirs = [global_config.model_dir, os.path.join(global_config.model_dir, 'asr'), os.path.join(global_config.model_dir, 'nlp'), os.path.join(global_config.model_dir, 'tts')] for search_dir in search_dirs: search_path = os.path.join(search_dir, path) if os.path.exists(search_path): return search_path raise IOError(f"failed to locate resource '{path}'") def load_resource(resource, factory_map, *args, **kwargs): """ Load an instance of a resource from a config or service name. The factory_map dict maps the backend names to class names. Returns the resource instance, or the config if factory_map is null. 
""" if isinstance(resource, str): root, ext = os.path.splitext(resource) if len(ext) > 0: ext = ext.lower() if ext == '.json': config = ConfigDict(path=resource) elif ext == '.onnx' or ext == '.engine' or ext == '.plan': config = ConfigDict(path=root + '.json') else: raise ValueError(f"resource '{resource}' has invalid extension '{ext}'") else: manifest = download_model(resource) if manifest['type'] == 'model': config = ConfigDict(path=get_model_config_path(manifest=manifest)) else: config = ConfigDict(backend=manifest['backend'], type=manifest['name']) elif isinstance(resource, ConfigDict): config = resource elif isinstance(resource, dict): config = ConfigDict(resource) else: raise ValueError(f"expected string or dict type, instead got {type(resource).__name__}") config.setdefault('backend', global_config.default_backend) if factory_map is None: return config if config.backend not in factory_map: raise ValueError(f"'{config.path}' has invalid backend '{config.backend}' (valid options are: {', '.join(factory_map.keys())})") class_name = factory_map[config.backend].rsplit(".", 1) class_type = getattr(importlib.import_module(class_name[0]), class_name[1]) logging.debug(f"creating instance of {factory_map[config.backend]} for '{config.path}' (backend {config.backend})") logging.debug(class_type) return class_type(config, *args, **kwargs) def load_model(config, dynamic_shapes=None): """ Loads an ONNX model through a backend (either TensorRT or onnxruntime) """ factory_map = { 'tensorrt' : 'jetson_voice.backends.tensorrt.TRTModel', 'onnxruntime' : 'jetson_voice.backends.onnxruntime.OnnxRuntimeModel' } config.setdefault('backend', global_config.default_backend) config.setdefault('model_path', os.path.splitext(config.path)[0] + '.onnx') if not os.path.exists(config.model_path): model_path = os.path.join(os.path.dirname(config.path), config.model_path) if not os.path.exists(model_path): raise IOError(f"couldn't find file '{config.model_path}'") else: config.model_path = model_path if config.backend not in factory_map: raise ValueError(f"'{config.path}' has invalid backend '{config.backend}' (valid options are: {', '.join(factory_map.keys())})") class_name = factory_map[config.backend].rsplit(".", 1) class_type = getattr(importlib.import_module(class_name[0]), class_name[1]) logging.info(f"loading model '{config.model_path}' with {factory_map[config.backend]}") logging.debug(class_type) return class_type(config, dynamic_shapes=dynamic_shapes) def load_models_manifest(path=None): """ Load the models manifest file. If the path isn't overriden, it will use the default 'data/networks/manifest.json' """ if path is None: path = global_config.model_manifest with open(path) as file: manifest = json.load(file) for key in manifest: manifest[key].setdefault('name', key) manifest[key].setdefault('config', key + '.json') manifest[key].setdefault('type', 'model') return manifest def find_model_manifest(name): """ Find a model manifest entry by name / alias. """ manifest = load_models_manifest() for key in manifest: if key.lower() == name.lower(): return manifest[key] if 'alias' in manifest[key]: if isinstance(manifest[key]['alias'], str): aliases = [manifest[key]['alias']] else: aliases = manifest[key]['alias'] for alias in aliases: if alias.lower() == name.lower(): return manifest[key] raise ValueError(f"could not find '{name}' in manifest '{global_config.model_manifest}'") def download_model(name, max_attempts=10, retry_time=5): """ Download a model if it hasn't already been downloaded. 
""" manifest = find_model_manifest(name) if manifest is None: return None if manifest['type'] != 'model': return manifest if os.path.exists(get_model_config_path(manifest=manifest)): return manifest class DownloadProgressBar(tqdm.tqdm): def update_to(self, b=1, bsize=1, tsize=None): if tsize is not None: self.total = tsize self.update(b * bsize - self.n) def attempt_download(attempt): logging.info(f"downloading '{manifest['name']}' from {manifest['url']} (attempt {attempt} of {max_attempts})") with DownloadProgressBar(unit='B', unit_scale=True, miniters=1, desc=manifest['name']) as t: try: filename, _ = urllib.request.urlretrieve(manifest['url'], reporthook=t.update_to) except Exception as error: t.close() logging.error(error) return None return filename for attempt in range(1, max_attempts+1): filename = attempt_download(attempt) if filename is not None: break logging.error(f"failed to download '{manifest['name']}' from {manifest['url']} (attempt {attempt} of {max_attempts})") if attempt == max_attempts: raise ValueError(f"failed to download '{manifest['name']}' from {manifest['url']} (max attempts exceeded)") logging.info(f"waiting {retry_time} seconds before trying again...") time.sleep(retry_time) logging.info(f"extracting {filename} to {os.path.join(global_config.model_dir, manifest['domain'], manifest['name'])}") with tarfile.open(filename, "r:gz") as tar: tar.list() tar.extractall(path=os.path.join(global_config.model_dir, manifest['domain'])) os.remove(filename) return manifest def get_model_config_path(name=None, manifest=None): """ Gets the path to the model config from it's name or manifest entry. """ if name is None and manifest is None: raise ValueError('must specify either name or manifest arguments') if manifest is None: manifest = find_model_manifest(name) if manifest['type'] != 'model': raise ValueError(f"resource '{manifest['name']}' is not a model (type='{manifest['type']}')") if len(os.path.dirname(manifest['config'])) > 0: # if full path is specified return os.path.join(global_config.model_dir, manifest['domain'], manifest['config']) else: return os.path.join(global_config.model_dir, manifest['domain'], manifest['name'], manifest['config']) def list_models(): """ Print out the models available. """ manifest = load_models_manifest() print('') print('----------------------------------------------------') print(f" Models") print('----------------------------------------------------') for key in list(manifest): if manifest[key]['type'] != 'model': manifest.pop(key) pprint.pprint(manifest) print('') ================================================ FILE: jetson_voice/utils/softmax.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import numpy as np def softmax(x, theta=1.0, axis=None): """ Compute the softmax of each element along an axis of x. Parameters ---------- x: ND-Array. Probably should be floats. theta (optional): float parameter, used as a multiplier prior to exponentiation. Default = 1.0 axis (optional): axis to compute values along. Default is the first non-singleton axis. Returns an array the same size as X. The result will sum to 1 along the specified axis. 
""" y = np.atleast_2d(x) # find axis if axis is None: axis = next(j[0] for j in enumerate(y.shape) if j[1] > 1) # multiply y against the theta parameter, y = y * float(theta) # subtract the max for numerical stability y = y - np.expand_dims(np.max(y, axis = axis), axis) # exponentiate y y = np.exp(y) # take the sum along the specified axis ax_sum = np.expand_dims(np.sum(y, axis = axis), axis) # finally: divide elementwise p = y / ax_sum # flatten if X was 1D if len(x.shape) == 1: p = p.flatten() return p def normalize_logits(logits): """ Normalize logits such that they are distributed between [0,1] """ return np.exp(logits - np.log(np.sum(np.exp(logits), axis=-1, keepdims=True))) ================================================ FILE: patches/nemo/1.0.0rc1/exportable.original.py ================================================ # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os from abc import ABC from collections import defaultdict from enum import Enum from typing import Dict import onnx import torch from nemo.core.classes import typecheck from nemo.core.neural_types import AxisKind, NeuralType from nemo.utils import logging from nemo.utils.export_utils import replace_for_export try: import onnx_graphsurgeon as gs ONNX_GRAPHSURGEON_AVAILABLE = True except (ImportError, ModuleNotFoundError): ONNX_GRAPHSURGEON_AVAILABLE = False __all__ = ['ExportFormat', 'Exportable'] class ExportFormat(Enum): """Which format to use when exporting a Neural Module for deployment""" ONNX = (1,) TORCHSCRIPT = (2,) _EXT_DICT = { ".pt": ExportFormat.TORCHSCRIPT, ".onnx": ExportFormat.ONNX, } class Exportable(ABC): """ This Interface should be implemented by particular classes derived from nemo.core.NeuralModule or nemo.core.ModelPT. It gives these entities ability to be exported for deployment to formats such as ONNX. 
""" @staticmethod def get_format(filename: str): _, ext = os.path.splitext(filename) try: return _EXT_DICT[ext] except KeyError: raise ValueError(f"Export file {filename} extension does not correspond to any export format!") @property def input_module(self): return self @property def output_module(self): return self def get_input_names(self, input_example): if isinstance(input_example, Dict): input_names = list(input_example.keys()) else: if not (hasattr(self, 'input_types')): raise NotImplementedError( 'For export to work you must define input_types or pass names in input_example' ) input_names = list(self.input_types.keys()) # remove unnecessary inputs for input_ports for name in self.disabled_deployment_input_names: input_names.remove(name) return input_names def get_output_names(self, output_example): if isinstance(output_example, Dict): output_names = list(output_example.keys()) else: if not (hasattr(self, 'output_types')): raise NotImplementedError( 'For export to work you must define output_types or pass names in output_example' ) output_names = list(self.output_types.keys()) # remove unnecessary inputs for input_ports for name in self.disabled_deployment_output_names: output_names.remove(name) return output_names def get_input_dynamic_axes(self, input_names): dynamic_axes = defaultdict(list) for name in input_names: dynamic_axes = { **dynamic_axes, **self._extract_dynamic_axes(name, self.input_types[name]), } return dynamic_axes def get_output_dynamic_axes(self, output_names): dynamic_axes = defaultdict(list) for name in output_names: dynamic_axes = { **dynamic_axes, **self._extract_dynamic_axes(name, self.output_types[name]), } return dynamic_axes def export( self, output: str, input_example=None, output_example=None, verbose=False, export_params=True, do_constant_folding=True, keep_initializers_as_inputs=False, onnx_opset_version: int = 12, try_script: bool = False, set_eval: bool = True, check_trace: bool = True, use_dynamic_axes: bool = True, dynamic_axes=None, check_tolerance=0.01, forward_method=None, ): qual_name = self.__module__ + '.' 
+ self.__class__.__qualname__ output_descr = qual_name + ' exported to ONNX' exported = ([output], [output_descr]) try: # Disable typechecks typecheck.set_typecheck_enabled(enabled=False) # Allow user to completely override forward method to export if forward_method is None and hasattr(type(self), "forward_for_export"): forward_method = type(self).forward_for_export if forward_method: old_forward_method = type(self).forward type(self).forward = forward_method # Set module to eval mode if set_eval: self.eval() format = self.get_format(output) self._prepare_for_export() with torch.jit.optimized_execution(True): jitted_model = None if try_script: try: jitted_model = torch.jit.script(self) except Exception as e: print("jit.script() failed!", e) if input_example is None: input_example = self.input_module.input_example() with torch.jit.optimized_execution(True): if format == ExportFormat.TORCHSCRIPT: if isinstance(input_example, Dict): input_example = tuple(input_example.values()) if jitted_model is None: jitted_model = torch.jit.trace( self, input_example, strict=False, optimize=True, check_trace=check_trace, check_tolerance=check_tolerance, ) jitted_model.save(output) assert os.path.exists(output) elif format == ExportFormat.ONNX: if jitted_model is None: jitted_model = self if output_example is None: if isinstance(input_example, tuple): output_example = self.forward(*input_example) else: output_example = self.forward(input_example) input_names = self.input_module.get_input_names(input_example) output_names = self.output_module.get_output_names(output_example) # dynamic axis is a mapping from input/output_name => list of "dynamic" indices if dynamic_axes is None and use_dynamic_axes: dynamic_axes = self.input_module.get_input_dynamic_axes(input_names) dynamic_axes = {**dynamic_axes, **self.output_module.get_output_dynamic_axes(output_names)} if isinstance(input_example, Dict): input_example = tuple(input_example.values()) torch.onnx.export( jitted_model, input_example, output, input_names=input_names, output_names=output_names, verbose=verbose, export_params=export_params, do_constant_folding=do_constant_folding, keep_initializers_as_inputs=keep_initializers_as_inputs, dynamic_axes=dynamic_axes, opset_version=onnx_opset_version, example_outputs=output_example, ) # Verify the model can be read, and is valid onnx_model = onnx.load(output) onnx.checker.check_model(onnx_model, full_check=True) if do_constant_folding: if not ONNX_GRAPHSURGEON_AVAILABLE: logging.info( f"onnx-graphsurgeon module is not instlled." "That may result in suboptimal optimization of exported ONNX graph (including unneeded DOUBLE initializers)." "Please follow the instructions available at:" "https://github.com/NVIDIA/TensorRT/tree/master/tools/onnx-graphsurgeon" "to install onnx-graphsurgeon from source to improve exported graph." ) else: # This pass is to remove/recast certain constants that are generated as 'double' # Those constants break ONNX -> TRT conversion (TRT does not support 'double' as of 7.2) # Can probably be removed once TRT has automatic downcast for double. # However, it may still be useful even then as it seems to always make the graph shorter. 
graph = gs.import_onnx(onnx_model) onnx_model = gs.export_onnx(graph.fold_constants().cleanup()) onnx.checker.check_model(onnx_model, full_check=True) onnx.save(onnx_model, output) else: raise ValueError(f'Encountered unknown export format {format}.') finally: typecheck.set_typecheck_enabled(enabled=True) if forward_method: type(self).forward = old_forward_method return exported @property def disabled_deployment_input_names(self): """Implement this method to return a set of input names disabled for export""" return set() @property def disabled_deployment_output_names(self): """Implement this method to return a set of output names disabled for export""" return set() @property def supported_export_formats(self): """Implement this method to return a set of export formats supported. Default is all types.""" return set([ExportFormat.ONNX, ExportFormat.TORCHSCRIPT]) @staticmethod def _extract_dynamic_axes(name: str, ntype: NeuralType): """ Implement this method to provide dynamic axes id for ONNX export. By default, this method will extract BATCH and TIME dimension ids from each provided input/output name argument. For example, if module/model accepts argument named "input_signal" with type corresponding to [Batch, Time, Dim] shape, then the returned result should contain "input_signal" -> [0, 1] because Batch and Time are dynamic axes as they can change from call to call during inference. Args: name: Name of input or output parameter ntype: Corresponding Neural Type Returns: """ dynamic_axes = defaultdict(list) if ntype.axes: for ind, axis in enumerate(ntype.axes): if axis.kind in [AxisKind.Batch, AxisKind.Time, AxisKind.Width, AxisKind.Height]: dynamic_axes[name].append(ind) return dynamic_axes def _prepare_for_export(self, replace_1D_2D=False): """ Override this method to prepare module for export. This is in-place operation. Base version does common necessary module replacements (Apex etc) """ replace_for_export(self, replace_1D_2D) ================================================ FILE: patches/nemo/1.0.0rc1/exportable.py ================================================ # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
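
# This patched copy differs from exportable.original.py above by logging the
# ONNX input/output names, example shapes, and dynamic axes immediately before
# torch.onnx.export() is called (it also carries a disabled snippet for
# trimming extra input examples), which helps diagnose exported graph bindings.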
import os from abc import ABC from collections import defaultdict from enum import Enum from typing import Dict import onnx import torch from nemo.core.classes import typecheck from nemo.core.neural_types import AxisKind, NeuralType from nemo.utils import logging from nemo.utils.export_utils import replace_for_export try: import onnx_graphsurgeon as gs ONNX_GRAPHSURGEON_AVAILABLE = True except (ImportError, ModuleNotFoundError): ONNX_GRAPHSURGEON_AVAILABLE = False __all__ = ['ExportFormat', 'Exportable'] class ExportFormat(Enum): """Which format to use when exporting a Neural Module for deployment""" ONNX = (1,) TORCHSCRIPT = (2,) _EXT_DICT = { ".pt": ExportFormat.TORCHSCRIPT, ".onnx": ExportFormat.ONNX, } class Exportable(ABC): """ This Interface should be implemented by particular classes derived from nemo.core.NeuralModule or nemo.core.ModelPT. It gives these entities ability to be exported for deployment to formats such as ONNX. """ @staticmethod def get_format(filename: str): _, ext = os.path.splitext(filename) try: return _EXT_DICT[ext] except KeyError: raise ValueError(f"Export file {filename} extension does not correspond to any export format!") @property def input_module(self): return self @property def output_module(self): return self def get_input_names(self, input_example): if isinstance(input_example, Dict): input_names = list(input_example.keys()) else: if not (hasattr(self, 'input_types')): raise NotImplementedError( 'For export to work you must define input_types or pass names in input_example' ) input_names = list(self.input_types.keys()) # remove unnecessary inputs for input_ports for name in self.disabled_deployment_input_names: input_names.remove(name) return input_names def get_output_names(self, output_example): if isinstance(output_example, Dict): output_names = list(output_example.keys()) else: if not (hasattr(self, 'output_types')): raise NotImplementedError( 'For export to work you must define output_types or pass names in output_example' ) output_names = list(self.output_types.keys()) # remove unnecessary inputs for input_ports for name in self.disabled_deployment_output_names: output_names.remove(name) return output_names def get_input_dynamic_axes(self, input_names): dynamic_axes = defaultdict(list) for name in input_names: dynamic_axes = { **dynamic_axes, **self._extract_dynamic_axes(name, self.input_types[name]), } return dynamic_axes def get_output_dynamic_axes(self, output_names): dynamic_axes = defaultdict(list) for name in output_names: dynamic_axes = { **dynamic_axes, **self._extract_dynamic_axes(name, self.output_types[name]), } return dynamic_axes def export( self, output: str, input_example=None, output_example=None, verbose=False, export_params=True, do_constant_folding=True, keep_initializers_as_inputs=False, onnx_opset_version: int = 12, try_script: bool = False, set_eval: bool = True, check_trace: bool = True, use_dynamic_axes: bool = True, dynamic_axes=None, check_tolerance=0.01, forward_method=None, ): qual_name = self.__module__ + '.' 
+ self.__class__.__qualname__ output_descr = qual_name + ' exported to ONNX' exported = ([output], [output_descr]) try: # Disable typechecks typecheck.set_typecheck_enabled(enabled=False) # Allow user to completely override forward method to export if forward_method is None and hasattr(type(self), "forward_for_export"): forward_method = type(self).forward_for_export if forward_method: old_forward_method = type(self).forward type(self).forward = forward_method # Set module to eval mode if set_eval: self.eval() format = self.get_format(output) self._prepare_for_export() with torch.jit.optimized_execution(True): jitted_model = None if try_script: try: jitted_model = torch.jit.script(self) except Exception as e: print("jit.script() failed!", e) if input_example is None: input_example = self.input_module.input_example() with torch.jit.optimized_execution(True): if format == ExportFormat.TORCHSCRIPT: if isinstance(input_example, Dict): input_example = tuple(input_example.values()) if jitted_model is None: jitted_model = torch.jit.trace( self, input_example, strict=False, optimize=True, check_trace=check_trace, check_tolerance=check_tolerance, ) jitted_model.save(output) assert os.path.exists(output) elif format == ExportFormat.ONNX: if jitted_model is None: jitted_model = self if output_example is None: if isinstance(input_example, tuple): output_example = self.forward(*input_example) else: output_example = self.forward(input_example) input_names = self.input_module.get_input_names(input_example) output_names = self.output_module.get_output_names(output_example) # dynamic axis is a mapping from input/output_name => list of "dynamic" indices if dynamic_axes is None and use_dynamic_axes: dynamic_axes = self.input_module.get_input_dynamic_axes(input_names) dynamic_axes = {**dynamic_axes, **self.output_module.get_output_dynamic_axes(output_names)} if isinstance(input_example, tuple): logging.info(f'ONNX input_example {len(input_example)}') for idx, x in enumerate(input_example): logging.info(f' - {idx} {x.shape}') """ if len(input_names) < len(input_example): logging.warning(f'removing extra input_examples to match number of input_names') input_example = tuple([input_example[x] for x in range(len(input_names))]) logging.warning(f'new number of input_examples: {len(input_example)}') """ logging.info(f'ONNX class_name {type(self).__name__}') logging.info(f'ONNX input_names {input_names}') logging.info(f'ONNX output_names {output_names}') logging.info(f'ONNX dynamic_axes {dynamic_axes}') if isinstance(input_example, Dict): input_example = tuple(input_example.values()) torch.onnx.export( jitted_model, input_example, output, input_names=input_names, output_names=output_names, verbose=verbose, export_params=export_params, do_constant_folding=do_constant_folding, keep_initializers_as_inputs=keep_initializers_as_inputs, dynamic_axes=dynamic_axes, opset_version=onnx_opset_version, example_outputs=output_example, ) # Verify the model can be read, and is valid onnx_model = onnx.load(output) onnx.checker.check_model(onnx_model, full_check=True) if do_constant_folding: if not ONNX_GRAPHSURGEON_AVAILABLE: logging.info( f"onnx-graphsurgeon module is not instlled." "That may result in suboptimal optimization of exported ONNX graph (including unneeded DOUBLE initializers)." "Please follow the instructions available at:" "https://github.com/NVIDIA/TensorRT/tree/master/tools/onnx-graphsurgeon" "to install onnx-graphsurgeon from source to improve exported graph." 
) else: # This pass is to remove/recast certain constants that are generated as 'double' # Those constants break ONNX -> TRT conversion (TRT does not support 'double' as of 7.2) # Can probably be removed once TRT has automatic downcast for double. # However, it may still be useful even then as it seems to always make the graph shorter. graph = gs.import_onnx(onnx_model) onnx_model = gs.export_onnx(graph.fold_constants().cleanup()) onnx.checker.check_model(onnx_model, full_check=True) onnx.save(onnx_model, output) else: raise ValueError(f'Encountered unknown export format {format}.') finally: typecheck.set_typecheck_enabled(enabled=True) if forward_method: type(self).forward = old_forward_method return exported @property def disabled_deployment_input_names(self): """Implement this method to return a set of input names disabled for export""" return set() @property def disabled_deployment_output_names(self): """Implement this method to return a set of output names disabled for export""" return set() @property def supported_export_formats(self): """Implement this method to return a set of export formats supported. Default is all types.""" return set([ExportFormat.ONNX, ExportFormat.TORCHSCRIPT]) @staticmethod def _extract_dynamic_axes(name: str, ntype: NeuralType): """ Implement this method to provide dynamic axes id for ONNX export. By default, this method will extract BATCH and TIME dimension ids from each provided input/output name argument. For example, if module/model accepts argument named "input_signal" with type corresponding to [Batch, Time, Dim] shape, then the returned result should contain "input_signal" -> [0, 1] because Batch and Time are dynamic axes as they can change from call to call during inference. Args: name: Name of input or output parameter ntype: Corresponding Neural Type Returns: """ dynamic_axes = defaultdict(list) if ntype.axes: for ind, axis in enumerate(ntype.axes): if axis.kind in [AxisKind.Batch, AxisKind.Time, AxisKind.Width, AxisKind.Height]: dynamic_axes[name].append(ind) return dynamic_axes def _prepare_for_export(self, replace_1D_2D=False): """ Override this method to prepare module for export. This is in-place operation. Base version does common necessary module replacements (Apex etc) """ replace_for_export(self, replace_1D_2D) ================================================ FILE: patches/nemo/1.0.0rc1/nlp/__init__.py ================================================ # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
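
# Patched __init__.py for nemo.collections.nlp.modules.common.huggingface:
# adds the MobileBertEncoder import so the mobilebert.py module from this
# patch set is exposed alongside the stock encoder wrappers.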
from nemo.collections.nlp.modules.common.huggingface.albert import AlbertEncoder from nemo.collections.nlp.modules.common.huggingface.bert import BertEncoder from nemo.collections.nlp.modules.common.huggingface.distilbert import DistilBertEncoder from nemo.collections.nlp.modules.common.huggingface.huggingface_utils import ( get_huggingface_lm_model, get_huggingface_pretrained_lm_models_list, ) from nemo.collections.nlp.modules.common.huggingface.roberta import RobertaEncoder from nemo.collections.nlp.modules.common.huggingface.mobilebert import MobileBertEncoder ================================================ FILE: patches/nemo/1.0.0rc1/nlp/distilbert.diff ================================================ 17a18 > from typing import Dict, Optional 19a21 > from nemo.core.neural_types import ChannelType, MaskType, NeuralType 29a32,53 > @property > def input_types(self) -> Optional[Dict[str, NeuralType]]: > """ > These are ordered incorrectly in bert_module.py WRT to QAModel.forward() > DistilBert doesn't use token_type_ids, but the QAModel still needs them during export. > By re-ordring them, the correct input_names are used during export of the ONNX model. > """ > return { > "input_ids": NeuralType(('B', 'T'), ChannelType()), > "token_type_ids": NeuralType(('B', 'T'), ChannelType(), optional=True), > "attention_mask": NeuralType(('B', 'T'), MaskType(), optional=True) > } > > ''' > # note: disabling the token_type_ids here still leads to incorrect names, because QAModel.forward() > # still needs the token_type_ids to run the trace, and hence the input_example is still larger > @property > def disabled_deployment_input_names(self): > """Implement this method to return a set of input names disabled for export""" > return ['token_type_ids'] > ''' > 34a59 > \ No newline at end of file ================================================ FILE: patches/nemo/1.0.0rc1/nlp/distilbert.original.py ================================================ # Copyright 2020 The Google AI Language Team Authors and # The HuggingFace Inc. team. # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from transformers import DistilBertModel from nemo.collections.nlp.modules.common.bert_module import BertModule from nemo.core.classes import typecheck __all__ = ['DistilBertEncoder'] class DistilBertEncoder(DistilBertModel, BertModule): """ Wraps around the Huggingface transformers implementation repository for easy use within NeMo. """ @typecheck() def forward(self, input_ids, attention_mask, token_type_ids=None): # distilBert does not use token_type_ids as the most of the other Bert models res = super().forward(input_ids=input_ids, attention_mask=attention_mask)[0] return res ================================================ FILE: patches/nemo/1.0.0rc1/nlp/distilbert.py ================================================ # Copyright 2020 The Google AI Language Team Authors and # The HuggingFace Inc. team. # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from transformers import DistilBertModel from typing import Dict, Optional from nemo.collections.nlp.modules.common.bert_module import BertModule from nemo.core.neural_types import ChannelType, MaskType, NeuralType from nemo.core.classes import typecheck __all__ = ['DistilBertEncoder'] class DistilBertEncoder(DistilBertModel, BertModule): """ Wraps around the Huggingface transformers implementation repository for easy use within NeMo. """ @property def input_types(self) -> Optional[Dict[str, NeuralType]]: """ These are ordered incorrectly in bert_module.py WRT to QAModel.forward() DistilBert doesn't use token_type_ids, but the QAModel still needs them during export. By re-ordring them, the correct input_names are used during export of the ONNX model. """ return { "input_ids": NeuralType(('B', 'T'), ChannelType()), "token_type_ids": NeuralType(('B', 'T'), ChannelType(), optional=True), "attention_mask": NeuralType(('B', 'T'), MaskType(), optional=True) } ''' # note: disabling the token_type_ids here still leads to incorrect names, because QAModel.forward() # still needs the token_type_ids to run the trace, and hence the input_example is still larger @property def disabled_deployment_input_names(self): """Implement this method to return a set of input names disabled for export""" return ['token_type_ids'] ''' @typecheck() def forward(self, input_ids, attention_mask, token_type_ids=None): # distilBert does not use token_type_ids as the most of the other Bert models res = super().forward(input_ids=input_ids, attention_mask=attention_mask)[0] return res ================================================ FILE: patches/nemo/1.0.0rc1/nlp/huggingface_utils.py ================================================ # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
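
# Patched huggingface_utils.py: registers MobileBERT (default checkpoint
# 'google/mobilebert-uncased') in HUGGINGFACE_MODELS next to the BERT,
# DistilBERT, RoBERTa, and ALBERT entries.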
import os from typing import List, Optional from transformers import ( ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST, ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, BERT_PRETRAINED_MODEL_ARCHIVE_LIST, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST, ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST, AlbertConfig, AutoModel, BertConfig, DistilBertConfig, RobertaConfig, MobileBertConfig, ) from nemo.collections.nlp.modules.common.huggingface.albert import AlbertEncoder from nemo.collections.nlp.modules.common.huggingface.bert import BertEncoder from nemo.collections.nlp.modules.common.huggingface.distilbert import DistilBertEncoder from nemo.collections.nlp.modules.common.huggingface.roberta import RobertaEncoder from nemo.collections.nlp.modules.common.huggingface.mobilebert import MobileBertEncoder from nemo.utils import logging __all__ = ["get_huggingface_lm_model", "get_huggingface_pretrained_lm_models_list"] HUGGINGFACE_MODELS = { "BertModel": { "default": "bert-base-uncased", "class": BertEncoder, "config": BertConfig, "pretrained_model_list": BERT_PRETRAINED_MODEL_ARCHIVE_LIST, }, "DistilBertModel": { "default": "distilbert-base-uncased", "class": DistilBertEncoder, "config": DistilBertConfig, "pretrained_model_list": DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST, }, "RobertaModel": { "default": "roberta-base", "class": RobertaEncoder, "config": RobertaConfig, "pretrained_model_list": ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, }, "AlbertModel": { "default": "albert-base-v2", "class": AlbertEncoder, "config": AlbertConfig, "pretrained_model_list": ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST, }, "MobileBertModel": { "default": "google/mobilebert-uncased", "class": MobileBertEncoder, "config": MobileBertConfig, "pretrained_model_list": MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST, }, } def get_huggingface_lm_model( pretrained_model_name: str, config_dict: Optional[dict] = None, config_file: Optional[str] = None, ): """ Returns lm model instantiated with Huggingface Args: pretrained_mode_name: specify this to instantiate pretrained model from Huggingface, e.g. bert-base-cased. For entire list, see get_huggingface_pretrained_lm_models_list(). config_dict: model configuration dictionary used to instantiate Huggingface model from scratch config_file: path to model configuration file used to instantiate Huggingface model from scratch Returns: BertModule """ try: automodel = AutoModel.from_pretrained(pretrained_model_name) except Exception as e: raise ValueError(f"{pretrained_model_name} is not supported by HuggingFace. {e}") model_type = type(automodel).__name__ if model_type in HUGGINGFACE_MODELS: model_class = HUGGINGFACE_MODELS[model_type]["class"] if config_file: if not os.path.exists(config_file): logging.warning( f"Config file was not found at {config_file}. Will attempt to use config_dict or pretrained_model_name." ) else: config_class = HUGGINGFACE_MODELS[model_type]["config"] return model_class(config_class.from_json_file(config_file)) if config_dict: config_class = HUGGINGFACE_MODELS[model_type]["config"] return model_class(config=config_class(**config_dict)) else: return model_class.from_pretrained(pretrained_model_name) else: raise ValueError(f"Use HuffingFace API directly in NeMo for {pretrained_model_name}") def get_huggingface_pretrained_lm_models_list(include_external: bool = False,) -> List[str]: """ Returns the list of pretrained HuggingFace language models Args: include_external if true includes all HuggingFace model names, not only those supported language models in NeMo. 
Returns the list of HuggingFace models """ huggingface_models = [] if include_external: huggingface_models = list(ALL_PRETRAINED_CONFIG_ARCHIVE_MAP.keys()) else: for model in HUGGINGFACE_MODELS: model_names = HUGGINGFACE_MODELS[model]["pretrained_model_list"] huggingface_models.extend(model_names) return huggingface_models ================================================ FILE: patches/nemo/1.0.0rc1/nlp/location.txt ================================================ nemo/collections/nlp/modules/common/huggingface Main branch. Commit 21a17b267fac68d4cdd20f3969a580a0a40dbdb4 ================================================ FILE: patches/nemo/1.0.0rc1/nlp/mobilebert.py ================================================ # Copyright 2018 The Google AI Language Team Authors and # The HuggingFace Inc. team. # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from transformers import MobileBertModel from nemo.collections.nlp.modules.common.bert_module import BertModule from nemo.core.classes import typecheck __all__ = ['MobileBertEncoder'] class MobileBertEncoder(MobileBertModel, BertModule): """ Wraps around the Huggingface transformers implementation repository for easy use within NeMo. """ @typecheck() def forward(self, input_ids, attention_mask, token_type_ids): res = super().forward(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)[0] return res ================================================ FILE: patches/nemo/1.0.0rc1/setup.original.py ================================================ # ! /usr/bin/python # -*- coding: utf-8 -*- # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
"""Setup for pip package.""" import codecs import os import subprocess import sys from distutils import cmd as distutils_cmd from distutils import log as distutils_log from itertools import chain import setuptools def is_build_action(): if len(sys.argv) <= 1: return False BUILD_TOKENS = ["egg_info", "dist", "bdist", "sdist", "install", "build", "develop", "style", "clean"] if any([sys.argv[1].startswith(x) for x in BUILD_TOKENS]): return True else: return False if is_build_action(): os.environ['NEMO_PACKAGE_BUILDING'] = 'True' from nemo.package_info import ( __contact_emails__, __contact_names__, __description__, __download_url__, __homepage__, __keywords__, __license__, __package_name__, __repository_url__, __version__, ) if os.path.exists('nemo/README.md'): with open("nemo/README.md", "r") as fh: long_description = fh.read() long_description_content_type = "text/markdown" elif os.path.exists('README.rst'): # codec is used for consistent encoding long_description = codecs.open( os.path.join(os.path.abspath(os.path.dirname(__file__)), 'README.rst'), 'r', 'utf-8', ).read() long_description_content_type = "text/x-rst" else: long_description = 'See ' + __homepage__ ############################################################################### # Dependency Loading # # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # def req_file(filename, folder="requirements"): with open(os.path.join(folder, filename)) as f: content = f.readlines() # you may also want to remove whitespace characters # Example: `\n` at the end of each line return [x.strip() for x in content] install_requires = req_file("requirements.txt") extras_require = { # User packages 'test': req_file("requirements_test.txt"), # Collections Packages 'asr': req_file("requirements_asr.txt"), 'cv': req_file("requirements_cv.txt"), 'nlp': req_file("requirements_nlp.txt"), 'tts': req_file("requirements_tts.txt"), } extras_require['all'] = list(chain(extras_require.values())) # TTS depends on ASR extras_require['tts'] = list(chain([extras_require['tts'], extras_require['asr']])) tests_requirements = extras_require["test"] ########################## VERSION MISMATCH PATCH ############################# # REMOVE AFTER 21.03 Container is released ! 
try: import torch version = torch.__version__ SUPPORTED_TORCH_VERSION = f"torch=={version}" if 'a' in version or 'b' in version: # It is githash release, force to supported Pytorch Lightning branch SUPPORTED_PYTORCH_LIGHTNING = "pytorch-lightning==1.1.5" else: # Downgrade torch, pytorch-lightning SUPPORTED_TORCH_VERSION = "torch<=1.7.1" SUPPORTED_PYTORCH_LIGHTNING = "pytorch-lightning==1.1.5" except (ImportError, ModuleNotFoundError): # Since no torch is installed, pip install torch will install latest torch and latest pytorch lightning SUPPORTED_TORCH_VERSION = "torch<=1.7.1" SUPPORTED_PYTORCH_LIGHTNING = "pytorch-lightning==1.1.5" install_requires_buffer = [] for ix, line in enumerate(install_requires): if 'lightning' in line: install_requires_buffer.append(SUPPORTED_PYTORCH_LIGHTNING) elif 'torch' in line: install_requires_buffer.append(SUPPORTED_TORCH_VERSION) # Pytorch 1.7.1 must use torchtext==0.8.0, torchaudio==0.7.2 and torchvision==0.8.2 if SUPPORTED_TORCH_VERSION == "torch<=1.7.1": install_requires_buffer.append("torchvision==0.8.2") install_requires_buffer.append("torchaudio==0.7.2") install_requires_buffer.append("torchtext==0.8.0") else: install_requires_buffer.append(line) # override install requires install_requires = install_requires_buffer ############################################################################### # Code style checkers # # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # class StyleCommand(distutils_cmd.Command): __LINE_WIDTH = 119 __ISORT_BASE = ( 'isort ' # These two lines makes isort compatible with black. '--multi-line=3 --trailing-comma --force-grid-wrap=0 ' f'--use-parentheses --line-width={__LINE_WIDTH} -rc -ws' ) __BLACK_BASE = f'black --skip-string-normalization --line-length={__LINE_WIDTH}' description = 'Checks overall project code style.' user_options = [ ('scope=', None, 'Folder of file to operate within.'), ('fix', None, 'True if tries to fix issues in-place.'), ] def __call_checker(self, base_command, scope, check): command = list(base_command) command.append(scope) if check: command.extend(['--check', '--diff']) self.announce( msg='Running command: %s' % str(' '.join(command)), level=distutils_log.INFO, ) return_code = subprocess.call(command) return return_code def _isort(self, scope, check): return self.__call_checker(base_command=self.__ISORT_BASE.split(), scope=scope, check=check,) def _black(self, scope, check): return self.__call_checker(base_command=self.__BLACK_BASE.split(), scope=scope, check=check,) def _pass(self): self.announce(msg='\033[32mPASS\x1b[0m', level=distutils_log.INFO) def _fail(self): self.announce(msg='\033[31mFAIL\x1b[0m', level=distutils_log.INFO) # noinspection PyAttributeOutsideInit def initialize_options(self): self.scope = '.' self.fix = '' def run(self): scope, check = self.scope, not self.fix isort_return = self._isort(scope=scope, check=check) black_return = self._black(scope=scope, check=check) if isort_return == 0 and black_return == 0: self._pass() else: self._fail() exit(isort_return if isort_return != 0 else black_return) def finalize_options(self): pass ############################################################################### setuptools.setup( name=__package_name__, # Versions should comply with PEP440. 
For a discussion on single-sourcing # the version across setup.py and the project code, see # https://packaging.python.org/en/latest/single_source_version.html version=__version__, description=__description__, long_description=long_description, long_description_content_type=long_description_content_type, # The project's main homepage. url=__repository_url__, download_url=__download_url__, # Author details author=__contact_names__, author_email=__contact_emails__, # maintainer Details maintainer=__contact_names__, maintainer_email=__contact_emails__, # The licence under which the project is released license=__license__, classifiers=[ # How mature is this project? Common values are # 1 - Planning # 2 - Pre-Alpha # 3 - Alpha # 4 - Beta # 5 - Production/Stable # 6 - Mature # 7 - Inactive 'Development Status :: 4 - Beta', # Indicate who your project is intended for 'Intended Audience :: Developers', 'Intended Audience :: Science/Research', 'Intended Audience :: Information Technology', # Indicate what your project relates to 'Topic :: Scientific/Engineering', 'Topic :: Scientific/Engineering :: Mathematics', 'Topic :: Scientific/Engineering :: Image Recognition', 'Topic :: Scientific/Engineering :: Artificial Intelligence', 'Topic :: Software Development :: Libraries', 'Topic :: Software Development :: Libraries :: Python Modules', 'Topic :: Utilities', # Pick your license as you wish (should match "license" above) 'License :: OSI Approved :: Apache Software License', # Supported python versions 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', # Additional Setting 'Environment :: Console', 'Natural Language :: English', 'Operating System :: OS Independent', ], packages=setuptools.find_packages(), install_requires=install_requires, setup_requires=['pytest-runner'], tests_require=tests_requirements, # List additional groups of dependencies here (e.g. development # dependencies). You can install these using the following syntax, # $ pip install -e ".[all]" # $ pip install nemo_toolkit[all] extras_require=extras_require, # Add in any packaged data. include_package_data=True, zip_safe=False, # PyPI package information. keywords=__keywords__, # Custom commands. cmdclass={'style': StyleCommand}, ) ================================================ FILE: patches/nemo/1.0.0rc1/setup.py ================================================ # ! /usr/bin/python # -*- coding: utf-8 -*- # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
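
# Patched setup.py: matches setup.original.py above except that the
# torchvision/torchaudio/torchtext version pins are dropped, since the
# container builds those packages from source at slightly different versions.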
"""Setup for pip package.""" import codecs import os import subprocess import sys from distutils import cmd as distutils_cmd from distutils import log as distutils_log from itertools import chain import setuptools def is_build_action(): if len(sys.argv) <= 1: return False BUILD_TOKENS = ["egg_info", "dist", "bdist", "sdist", "install", "build", "develop", "style", "clean"] if any([sys.argv[1].startswith(x) for x in BUILD_TOKENS]): return True else: return False if is_build_action(): os.environ['NEMO_PACKAGE_BUILDING'] = 'True' from nemo.package_info import ( __contact_emails__, __contact_names__, __description__, __download_url__, __homepage__, __keywords__, __license__, __package_name__, __repository_url__, __version__, ) if os.path.exists('nemo/README.md'): with open("nemo/README.md", "r") as fh: long_description = fh.read() long_description_content_type = "text/markdown" elif os.path.exists('README.rst'): # codec is used for consistent encoding long_description = codecs.open( os.path.join(os.path.abspath(os.path.dirname(__file__)), 'README.rst'), 'r', 'utf-8', ).read() long_description_content_type = "text/x-rst" else: long_description = 'See ' + __homepage__ ############################################################################### # Dependency Loading # # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # def req_file(filename, folder="requirements"): with open(os.path.join(folder, filename)) as f: content = f.readlines() # you may also want to remove whitespace characters # Example: `\n` at the end of each line return [x.strip() for x in content] install_requires = req_file("requirements.txt") extras_require = { # User packages 'test': req_file("requirements_test.txt"), # Collections Packages 'asr': req_file("requirements_asr.txt"), 'cv': req_file("requirements_cv.txt"), 'nlp': req_file("requirements_nlp.txt"), 'tts': req_file("requirements_tts.txt"), } extras_require['all'] = list(chain(extras_require.values())) # TTS depends on ASR extras_require['tts'] = list(chain([extras_require['tts'], extras_require['asr']])) tests_requirements = extras_require["test"] ########################## VERSION MISMATCH PATCH ############################# # REMOVE AFTER 21.03 Container is released ! 
try: import torch version = torch.__version__ SUPPORTED_TORCH_VERSION = f"torch=={version}" if 'a' in version or 'b' in version: # It is githash release, force to supported Pytorch Lightning branch SUPPORTED_PYTORCH_LIGHTNING = "pytorch-lightning==1.1.5" else: # Downgrade torch, pytorch-lightning SUPPORTED_TORCH_VERSION = "torch<=1.7.1" SUPPORTED_PYTORCH_LIGHTNING = "pytorch-lightning==1.1.5" except (ImportError, ModuleNotFoundError): # Since no torch is installed, pip install torch will install latest torch and latest pytorch lightning SUPPORTED_TORCH_VERSION = "torch<=1.7.1" SUPPORTED_PYTORCH_LIGHTNING = "pytorch-lightning==1.1.5" install_requires_buffer = [] for ix, line in enumerate(install_requires): if 'lightning' in line: install_requires_buffer.append(SUPPORTED_PYTORCH_LIGHTNING) elif 'torch' in line: install_requires_buffer.append(SUPPORTED_TORCH_VERSION) # Pytorch 1.7.1 must use torchtext==0.8.0, torchaudio==0.7.2 and torchvision==0.8.2 if SUPPORTED_TORCH_VERSION == "torch<=1.7.1": install_requires_buffer.append("torchvision") #"torchvision==0.8.2") # when we built from src in the container, it has a slightly different versions of these torch libraries install_requires_buffer.append("torchaudio") #"torchaudio==0.7.2") install_requires_buffer.append("torchtext") #"torchtext==0.8.0") else: install_requires_buffer.append(line) # override install requires install_requires = install_requires_buffer ############################################################################### # Code style checkers # # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # class StyleCommand(distutils_cmd.Command): __LINE_WIDTH = 119 __ISORT_BASE = ( 'isort ' # These two lines makes isort compatible with black. '--multi-line=3 --trailing-comma --force-grid-wrap=0 ' f'--use-parentheses --line-width={__LINE_WIDTH} -rc -ws' ) __BLACK_BASE = f'black --skip-string-normalization --line-length={__LINE_WIDTH}' description = 'Checks overall project code style.' user_options = [ ('scope=', None, 'Folder of file to operate within.'), ('fix', None, 'True if tries to fix issues in-place.'), ] def __call_checker(self, base_command, scope, check): command = list(base_command) command.append(scope) if check: command.extend(['--check', '--diff']) self.announce( msg='Running command: %s' % str(' '.join(command)), level=distutils_log.INFO, ) return_code = subprocess.call(command) return return_code def _isort(self, scope, check): return self.__call_checker(base_command=self.__ISORT_BASE.split(), scope=scope, check=check,) def _black(self, scope, check): return self.__call_checker(base_command=self.__BLACK_BASE.split(), scope=scope, check=check,) def _pass(self): self.announce(msg='\033[32mPASS\x1b[0m', level=distutils_log.INFO) def _fail(self): self.announce(msg='\033[31mFAIL\x1b[0m', level=distutils_log.INFO) # noinspection PyAttributeOutsideInit def initialize_options(self): self.scope = '.' self.fix = '' def run(self): scope, check = self.scope, not self.fix isort_return = self._isort(scope=scope, check=check) black_return = self._black(scope=scope, check=check) if isort_return == 0 and black_return == 0: self._pass() else: self._fail() exit(isort_return if isort_return != 0 else black_return) def finalize_options(self): pass ############################################################################### setuptools.setup( name=__package_name__, # Versions should comply with PEP440. 
For a discussion on single-sourcing # the version across setup.py and the project code, see # https://packaging.python.org/en/latest/single_source_version.html version=__version__, description=__description__, long_description=long_description, long_description_content_type=long_description_content_type, # The project's main homepage. url=__repository_url__, download_url=__download_url__, # Author details author=__contact_names__, author_email=__contact_emails__, # maintainer Details maintainer=__contact_names__, maintainer_email=__contact_emails__, # The licence under which the project is released license=__license__, classifiers=[ # How mature is this project? Common values are # 1 - Planning # 2 - Pre-Alpha # 3 - Alpha # 4 - Beta # 5 - Production/Stable # 6 - Mature # 7 - Inactive 'Development Status :: 4 - Beta', # Indicate who your project is intended for 'Intended Audience :: Developers', 'Intended Audience :: Science/Research', 'Intended Audience :: Information Technology', # Indicate what your project relates to 'Topic :: Scientific/Engineering', 'Topic :: Scientific/Engineering :: Mathematics', 'Topic :: Scientific/Engineering :: Image Recognition', 'Topic :: Scientific/Engineering :: Artificial Intelligence', 'Topic :: Software Development :: Libraries', 'Topic :: Software Development :: Libraries :: Python Modules', 'Topic :: Utilities', # Pick your license as you wish (should match "license" above) 'License :: OSI Approved :: Apache Software License', # Supported python versions 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', # Additional Setting 'Environment :: Console', 'Natural Language :: English', 'Operating System :: OS Independent', ], packages=setuptools.find_packages(), install_requires=install_requires, setup_requires=['pytest-runner'], tests_require=tests_requirements, # List additional groups of dependencies here (e.g. development # dependencies). You can install these using the following syntax, # $ pip install -e ".[all]" # $ pip install nemo_toolkit[all] extras_require=extras_require, # Add in any packaged data. include_package_data=True, zip_safe=False, # PyPI package information. keywords=__keywords__, # Custom commands. 
cmdclass={'style': StyleCommand}, ) ================================================ FILE: patches/nemo/1.6.2/requirements.original.txt ================================================ numpy>=1.21 onnx>=1.7.0 python-dateutil torch wrapt ruamel.yaml scikit-learn sentencepiece<1.0.0 tqdm>=4.41.0 numba wget frozendict unidecode ================================================ FILE: patches/nemo/1.6.2/requirements.txt ================================================ numpy onnx>=1.7.0 python-dateutil torch wrapt ruamel.yaml scikit-learn sentencepiece<1.0.0 tqdm>=4.41.0 numba wget frozendict unidecode ================================================ FILE: patches/nemo/1.6.2/requirements_nlp.original.txt ================================================ boto3 h5py matplotlib>=3.3.2 sentencepiece youtokentome>=1.0.5 numpy rapidfuzz gdown inflect sacrebleu[ja] sacremoses>=0.0.43 nltk>=3.6.5 fasttext opencc pangu jieba ftfy ================================================ FILE: patches/nemo/1.6.2/requirements_nlp.txt ================================================ boto3 h5py matplotlib sentencepiece youtokentome>=1.0.5 numpy gdown inflect sacremoses>=0.0.43 nltk>=3.6.5 fasttext opencc pangu jieba ftfy ================================================ FILE: patches/pytorch/1.6.0/functional.diff ================================================ 2a3,5 > import librosa # STFT patch for aarch64 > import numpy as np > 465c468,478 < return _VF.stft(input, n_fft, hop_length, win_length, window, normalized, onesided) --- > > # STFT patch for aarch64 > # https://stackoverflow.com/a/66872148 > librosa_stft = librosa.stft(input.cpu().detach().numpy().reshape(-1), n_fft, hop_length, win_length, window="hann", center=center, pad_mode=pad_mode) > librosa_stft = np.array([[a.real, a.imag] for a in librosa_stft]) > librosa_stft = np.transpose(librosa_stft, axes=[0, 2, 1]) > librosa_stft = np.expand_dims(librosa_stft, 0) > librosa_stft = torch.from_numpy(librosa_stft) > return librosa_stft > #return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore > # normalized, onesided, return_complex) ================================================ FILE: patches/pytorch/1.6.0/functional.original.py ================================================ from typing import Tuple, Optional import torch import torch.nn.functional as F from ._lowrank import svd_lowrank, pca_lowrank from ._overrides import has_torch_function, handle_torch_function from ._jit_internal import boolean_dispatch, List from ._jit_internal import _overload as overload Tensor = torch.Tensor from torch import _VF __all__ = [ 'align_tensors', 'broadcast_tensors', 'cartesian_prod', 'block_diag', 'cdist', 'chain_matmul', 'einsum', 'istft', 'lu', 'lu_unpack', 'norm', 'meshgrid', 'pca_lowrank', 'split', 'stft', 'svd_lowrank', 'tensordot', 'unique', 'unique_consecutive', ] def broadcast_tensors(*tensors): r"""broadcast_tensors(*tensors) -> List of Tensors Broadcasts the given tensors according to :ref:`broadcasting-semantics`. Args: *tensors: any number of tensors of the same type .. warning:: More than one element of a broadcasted tensor may refer to a single memory location. As a result, in-place operations (especially ones that are vectorized) may result in incorrect behavior. If you need to write to the tensors, please clone them first. 
Example:: >>> x = torch.arange(3).view(1, 3) >>> y = torch.arange(2).view(2, 1) >>> a, b = torch.broadcast_tensors(x, y) >>> a.size() torch.Size([2, 3]) >>> a tensor([[0, 1, 2], [0, 1, 2]]) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(broadcast_tensors, tensors, *tensors) return _VF.broadcast_tensors(tensors) def split(tensor, split_size_or_sections, dim=0): r"""Splits the tensor into chunks. Each chunk is a view of the original tensor. If :attr:`split_size_or_sections` is an integer type, then :attr:`tensor` will be split into equally sized chunks (if possible). Last chunk will be smaller if the tensor size along the given dimension :attr:`dim` is not divisible by :attr:`split_size`. If :attr:`split_size_or_sections` is a list, then :attr:`tensor` will be split into ``len(split_size_or_sections)`` chunks with sizes in :attr:`dim` according to :attr:`split_size_or_sections`. Arguments: tensor (Tensor): tensor to split. split_size_or_sections (int) or (list(int)): size of a single chunk or list of sizes for each chunk dim (int): dimension along which to split the tensor. Example:: >>> a = torch.arange(10).reshape(5,2) >>> a tensor([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]]) >>> torch.split(a, 2) (tensor([[0, 1], [2, 3]]), tensor([[4, 5], [6, 7]]), tensor([[8, 9]])) >>> torch.split(a, [1,4]) (tensor([[0, 1]]), tensor([[2, 3], [4, 5], [6, 7], [8, 9]])) """ if not torch.jit.is_scripting(): if type(tensor) is not Tensor and has_torch_function((tensor,)): return handle_torch_function(split, (tensor,), tensor, split_size_or_sections, dim=dim) # Overwriting reason: # This dispatches to two ATen functions depending on the type of # split_size_or_sections. The branching code is in tensor.py, which we # call here. return tensor.split(split_size_or_sections, dim) # equivalent to itertools.product(indices) def _indices_product(indices): # type: (List[int]) -> (List[List[int]]) empty_list = torch.jit.annotate(List[int], []) result = [empty_list] for idx in indices: result_temp = torch.jit.annotate(List[List[int]], []) for res in result: for i in range(idx): result_temp.append(res + [i]) result = result_temp return result def _index_tensor_with_indices_list(tensor, indices): # type: (Tensor, List[int]) -> Tensor out = tensor for index in indices: out = out[index] return out def lu_unpack(LU_data, LU_pivots, unpack_data=True, unpack_pivots=True): # type: (Tensor, Tensor, bool, bool) -> (Tuple[Optional[Tensor], Optional[Tensor], Optional[Tensor]]) r"""Unpacks the data and pivots from a LU factorization of a tensor. Returns a tuple of tensors as ``(the pivots, the L tensor, the U tensor)``. 
Arguments: LU_data (Tensor): the packed LU factorization data LU_pivots (Tensor): the packed LU factorization pivots unpack_data (bool): flag indicating if the data should be unpacked unpack_pivots (bool): flag indicating if the pivots should be unpacked Examples:: >>> A = torch.randn(2, 3, 3) >>> A_LU, pivots = A.lu() >>> P, A_L, A_U = torch.lu_unpack(A_LU, pivots) >>> >>> # can recover A from factorization >>> A_ = torch.bmm(P, torch.bmm(A_L, A_U)) >>> # LU factorization of a rectangular matrix: >>> A = torch.randn(2, 3, 2) >>> A_LU, pivots = A.lu() >>> P, A_L, A_U = torch.lu_unpack(A_LU, pivots) >>> P tensor([[[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]], [[0., 0., 1.], [0., 1., 0.], [1., 0., 0.]]]) >>> A_L tensor([[[ 1.0000, 0.0000], [ 0.4763, 1.0000], [ 0.3683, 0.1135]], [[ 1.0000, 0.0000], [ 0.2957, 1.0000], [-0.9668, -0.3335]]]) >>> A_U tensor([[[ 2.1962, 1.0881], [ 0.0000, -0.8681]], [[-1.0947, 0.3736], [ 0.0000, 0.5718]]]) >>> A_ = torch.bmm(P, torch.bmm(A_L, A_U)) >>> torch.norm(A_ - A) tensor(2.9802e-08) """ if not torch.jit.is_scripting(): tens_ops = (LU_data, LU_pivots) if any([type(t) is not Tensor for t in tens_ops]) and has_torch_function(tens_ops): return handle_torch_function( lu_unpack, tens_ops, LU_data, LU_pivots, unpack_data=unpack_data, unpack_pivots=unpack_pivots) shape = LU_data.shape # In generalized LU factorization, the following shape relations hold: # A.shape[-2:] == (m, n) # P.shape[-2:] == (m, m) # L.shape[-2:] == (m, k) # U.shape[-2:] == (k, n) # where k = min(m, n) m, n = shape[-2:] k = min(m, n) if unpack_data: U = LU_data.triu() if m != k: U = U.narrow(-2, 0, k) L = LU_data.tril() if k != n: L = L.narrow(-1, 0, k) L.diagonal(dim1=-2, dim2=-1).fill_(1) else: L = U = None if unpack_pivots: LU_pivots_zero_idx = LU_pivots - 1 if LU_data.dim() > 2: P = torch.eye(m, device=LU_data.device, dtype=LU_data.dtype) \ .expand(shape[:-1] + (m,)) \ .clone(memory_format=torch.contiguous_format) # TODO: rewrite when TorchScript supports product and map as # product(*map(lambda x: list(range(x)), shape[:-2])) when issue 33781 is fixed indices = _indices_product(shape[:-2]) for idx in indices: final_order = [i for i in range(m)] # noqa: C416 TODO: rewrite as list(range(m)) for k, j in enumerate(_index_tensor_with_indices_list(LU_pivots_zero_idx, idx)): final_order[k], final_order[j] = final_order[j], final_order[k] # TODO: remove _index_tensor_with_indices_list when TorchScript supports indexing Tensor with list p_idx = _index_tensor_with_indices_list(P, idx) p_idx.copy_(p_idx.index_select(1, torch.as_tensor(final_order, device=LU_pivots.device))) else: P = torch.eye(m, device=LU_data.device, dtype=LU_data.dtype) final_order = [i for i in range(m)] # noqa: C416 TODO: rewrite as list(range(m)) for k, j, in enumerate(LU_pivots_zero_idx): final_order[k], final_order[j] = final_order[j], final_order[k] P = P.index_select(1, torch.as_tensor(final_order, device=LU_pivots.device)) else: P = None return P, L, U def einsum(equation, *operands): r"""einsum(equation, *operands) -> Tensor This function provides a way of computing multilinear expressions (i.e. sums of products) using the Einstein summation convention. Args: equation (string): The equation is given in terms of lower case letters (indices) to be associated with each dimension of the operands and result. The left hand side lists the operands dimensions, separated by commas. There should be one index letter per tensor dimension. The right hand side follows after `->` and gives the indices for the output. 
If the `->` and right hand side are omitted, it implicitly defined as the alphabetically sorted list of all indices appearing exactly once in the left hand side. The indices not apprearing in the output are summed over after multiplying the operands entries. If an index appears several times for the same operand, a diagonal is taken. Ellipses `...` represent a fixed number of dimensions. If the right hand side is inferred, the ellipsis dimensions are at the beginning of the output. operands (Tensor): The operands to compute the Einstein sum of. .. note:: This function does not optimize the given expression, so a different formula for the same computation may run faster or consume less memory. Projects like opt_einsum (https://optimized-einsum.readthedocs.io/en/stable/) can optimize the formula for you. Examples:: >>> x = torch.randn(5) >>> y = torch.randn(4) >>> torch.einsum('i,j->ij', x, y) # outer product tensor([[-0.0570, -0.0286, -0.0231, 0.0197], [ 1.2616, 0.6335, 0.5113, -0.4351], [ 1.4452, 0.7257, 0.5857, -0.4984], [-0.4647, -0.2333, -0.1883, 0.1603], [-1.1130, -0.5588, -0.4510, 0.3838]]) >>> A = torch.randn(3,5,4) >>> l = torch.randn(2,5) >>> r = torch.randn(2,4) >>> torch.einsum('bn,anm,bm->ba', l, A, r) # compare torch.nn.functional.bilinear tensor([[-0.3430, -5.2405, 0.4494], [ 0.3311, 5.5201, -3.0356]]) >>> As = torch.randn(3,2,5) >>> Bs = torch.randn(3,5,4) >>> torch.einsum('bij,bjk->bik', As, Bs) # batch matrix multiplication tensor([[[-1.0564, -1.5904, 3.2023, 3.1271], [-1.6706, -0.8097, -0.8025, -2.1183]], [[ 4.2239, 0.3107, -0.5756, -0.2354], [-1.4558, -0.3460, 1.5087, -0.8530]], [[ 2.8153, 1.8787, -4.3839, -1.2112], [ 0.3728, -2.1131, 0.0921, 0.8305]]]) >>> A = torch.randn(3, 3) >>> torch.einsum('ii->i', A) # diagonal tensor([-0.7825, 0.8291, -0.1936]) >>> A = torch.randn(4, 3, 3) >>> torch.einsum('...ii->...i', A) # batch diagonal tensor([[-1.0864, 0.7292, 0.0569], [-0.9725, -1.0270, 0.6493], [ 0.5832, -1.1716, -1.5084], [ 0.4041, -1.1690, 0.8570]]) >>> A = torch.randn(2, 3, 4, 5) >>> torch.einsum('...ij->...ji', A).shape # batch permute torch.Size([2, 3, 5, 4]) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in operands) and has_torch_function(operands): return handle_torch_function(einsum, operands, equation, *operands) if len(operands) == 1 and isinstance(operands[0], (list, tuple)): # the old interface of passing the operands as one list argument operands = operands[0] # recurse incase operands contains value that has torch function # in the original implementation this line is omitted return einsum(equation, *operands) return _VF.einsum(equation, operands) def meshgrid(*tensors): r"""Take :math:`N` tensors, each of which can be either scalar or 1-dimensional vector, and create :math:`N` N-dimensional grids, where the :math:`i` :sup:`th` grid is defined by expanding the :math:`i` :sup:`th` input over dimensions defined by other inputs. Args: tensors (list of Tensor): list of scalars or 1 dimensional tensors. Scalars will be treated as tensors of size :math:`(1,)` automatically Returns: seq (sequence of Tensors): If the input has :math:`k` tensors of size :math:`(N_1,), (N_2,), \ldots , (N_k,)`, then the output would also have :math:`k` tensors, where all tensors are of size :math:`(N_1, N_2, \ldots , N_k)`. 
Example:: >>> x = torch.tensor([1, 2, 3]) >>> y = torch.tensor([4, 5, 6]) >>> grid_x, grid_y = torch.meshgrid(x, y) >>> grid_x tensor([[1, 1, 1], [2, 2, 2], [3, 3, 3]]) >>> grid_y tensor([[4, 5, 6], [4, 5, 6], [4, 5, 6]]) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(meshgrid, tensors, *tensors) if len(tensors) == 1 and isinstance(tensors[0], (list, tuple)): # the old interface of passing the operands as one list argument tensors = tensors[0] return _VF.meshgrid(tensors) def stft(input, n_fft, hop_length=None, win_length=None, window=None, center=True, pad_mode='reflect', normalized=False, onesided=True): # type: (Tensor, int, Optional[int], Optional[int], Optional[Tensor], bool, str, bool, bool) -> Tensor r"""Short-time Fourier transform (STFT). Ignoring the optional batch dimension, this method computes the following expression: .. math:: X[m, \omega] = \sum_{k = 0}^{\text{win\_length-1}}% \text{window}[k]\ \text{input}[m \times \text{hop\_length} + k]\ % \exp\left(- j \frac{2 \pi \cdot \omega k}{\text{win\_length}}\right), where :math:`m` is the index of the sliding window, and :math:`\omega` is the frequency that :math:`0 \leq \omega < \text{n\_fft}`. When :attr:`onesided` is the default value ``True``, * :attr:`input` must be either a 1-D time sequence or a 2-D batch of time sequences. * If :attr:`hop_length` is ``None`` (default), it is treated as equal to ``floor(n_fft / 4)``. * If :attr:`win_length` is ``None`` (default), it is treated as equal to :attr:`n_fft`. * :attr:`window` can be a 1-D tensor of size :attr:`win_length`, e.g., from :meth:`torch.hann_window`. If :attr:`window` is ``None`` (default), it is treated as if having :math:`1` everywhere in the window. If :math:`\text{win\_length} < \text{n\_fft}`, :attr:`window` will be padded on both sides to length :attr:`n_fft` before being applied. * If :attr:`center` is ``True`` (default), :attr:`input` will be padded on both sides so that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. Otherwise, the :math:`t`-th frame begins at time :math:`t \times \text{hop\_length}`. * :attr:`pad_mode` determines the padding method used on :attr:`input` when :attr:`center` is ``True``. See :meth:`torch.nn.functional.pad` for all available options. Default is ``"reflect"``. * If :attr:`onesided` is ``True`` (default), only values for :math:`\omega` in :math:`\left[0, 1, 2, \dots, \left\lfloor \frac{\text{n\_fft}}{2} \right\rfloor + 1\right]` are returned because the real-to-complex Fourier transform satisfies the conjugate symmetry, i.e., :math:`X[m, \omega] = X[m, \text{n\_fft} - \omega]^*`. * If :attr:`normalized` is ``True`` (default is ``False``), the function returns the normalized STFT results, i.e., multiplied by :math:`(\text{frame\_length})^{-0.5}`. Returns the real and the imaginary parts together as one tensor of size :math:`(* \times N \times T \times 2)`, where :math:`*` is the optional batch size of :attr:`input`, :math:`N` is the number of frequencies where STFT is applied, :math:`T` is the total number of frames used, and each pair in the last dimension represents a complex number as the real part and the imaginary part. .. warning:: This function changed signature at version 0.4.1. Calling with the previous signature may cause error or return incorrect result. 
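As a quick shape check (an illustrative example added editorially; it is not part of the upstream docstring, and the shapes assume the defaults ``center=True`` and ``onesided=True``)::

    >>> x = torch.randn(1, 16000)                       # one second of 16 kHz audio
    >>> torch.stft(x, n_fft=512, hop_length=160).shape  # (batch, n_fft//2 + 1, n_frames, 2)
    torch.Size([1, 257, 101, 2])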
Arguments: input (Tensor): the input tensor n_fft (int): size of Fourier transform hop_length (int, optional): the distance between neighboring sliding window frames. Default: ``None`` (treated as equal to ``floor(n_fft / 4)``) win_length (int, optional): the size of window frame and STFT filter. Default: ``None`` (treated as equal to :attr:`n_fft`) window (Tensor, optional): the optional window function. Default: ``None`` (treated as window of all :math:`1` s) center (bool, optional): whether to pad :attr:`input` on both sides so that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. Default: ``True`` pad_mode (string, optional): controls the padding method used when :attr:`center` is ``True``. Default: ``"reflect"`` normalized (bool, optional): controls whether to return the normalized STFT results Default: ``False`` onesided (bool, optional): controls whether to return half of results to avoid redundancy Default: ``True`` Returns: Tensor: A tensor containing the STFT result with shape described above """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( stft, (input,), input, n_fft, hop_length=hop_length, win_length=win_length, window=window, center=center, pad_mode=pad_mode, normalized=normalized, onesided=onesided) # TODO: after having proper ways to map Python strings to ATen Enum, move # this and F.pad to ATen. if center: signal_dim = input.dim() extended_shape = [1] * (3 - signal_dim) + list(input.size()) pad = int(n_fft // 2) input = F.pad(input.view(extended_shape), (pad, pad), pad_mode) input = input.view(input.shape[-signal_dim:]) return _VF.stft(input, n_fft, hop_length, win_length, window, normalized, onesided) def istft(input, n_fft, hop_length=None, win_length=None, window=None, center=True, normalized=False, onesided=True, length=None): # type: (Tensor, int, Optional[int], Optional[int], Optional[Tensor], bool, bool, bool, Optional[int]) -> Tensor r"""Inverse short time Fourier Transform. This is expected to be the inverse of :func:`~torch.stft`. It has the same parameters (+ additional optional parameter of :attr:`length`) and it should return the least squares estimation of the original signal. The algorithm will check using the NOLA condition ( nonzero overlap). Important consideration in the parameters :attr:`window` and :attr:`center` so that the envelop created by the summation of all the windows is never zero at certain point in time. Specifically, :math:`\sum_{t=-\infty}^{\infty} w^2[n-t\times hop\_length] \cancel{=} 0`. Since :func:`~torch.stft` discards elements at the end of the signal if they do not fit in a frame, ``istft`` may return a shorter signal than the original signal (can occur if :attr:`center` is False since the signal isn't padded). If :attr:`center` is ``True``, then there will be padding e.g. ``'constant'``, ``'reflect'``, etc. Left padding can be trimmed off exactly because they can be calculated but right padding cannot be calculated without additional information. Example: Suppose the last window is: ``[17, 18, 0, 0, 0]`` vs ``[18, 0, 0, 0, 0]`` The :attr:`n_fft`, :attr:`hop_length`, :attr:`win_length` are all the same which prevents the calculation of right padding. These additional values could be zeros or a reflection of the signal so providing :attr:`length` could be useful. If :attr:`length` is ``None`` then padding will be aggressively removed (some loss of signal). [1] D. W. Griffin and J. S. 
Lim, "Signal estimation from modified short-time Fourier transform," IEEE Trans. ASSP, vol.32, no.2, pp.236-243, Apr. 1984. Arguments: input (Tensor): The input tensor. Expected to be output of :func:`~torch.stft`, either 3D (``fft_size``, ``n_frame``, 2) or 4D (``channel``, ``fft_size``, ``n_frame``, 2). n_fft (int): Size of Fourier transform hop_length (Optional[int]): The distance between neighboring sliding window frames. (Default: ``n_fft // 4``) win_length (Optional[int]): The size of window frame and STFT filter. (Default: ``n_fft``) window (Optional[torch.Tensor]): The optional window function. (Default: ``torch.ones(win_length)``) center (bool): Whether :attr:`input` was padded on both sides so that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. (Default: ``True``) normalized (bool): Whether the STFT was normalized. (Default: ``False``) onesided (bool): Whether the STFT is onesided. (Default: ``True``) length (Optional[int]): The amount to trim the signal by (i.e. the original signal length). (Default: whole signal) Returns: Tensor: Least squares estimation of the original signal of size (..., signal_length) """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( istft, (input,), input, n_fft, hop_length=hop_length, win_length=win_length, window=window, center=center, normalized=normalized, onesided=onesided, length=length) return _VF.istft( input, n_fft, hop_length, win_length, window, center, normalized, onesided, length) del torch.unique_dim def _unique_impl(input, sorted=True, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor, Tensor] r"""Returns the unique elements of the input tensor. .. note:: This function is different from :func:`torch.unique_consecutive` in the sense that this function also eliminates non-consecutive duplicate values. .. note:: Currently in the CUDA implementation and the CPU implementation when dim is specified, `torch.unique` always sort the tensor at the beginning regardless of the `sort` argument. Sorting could be slow, so if your input tensor is already sorted, it is recommended to use :func:`torch.unique_consecutive` which avoids the sorting. Arguments: input (Tensor): the input tensor sorted (bool): Whether to sort the unique elements in ascending order before returning as output. return_inverse (bool): Whether to also return the indices for where elements in the original input ended up in the returned unique list. return_counts (bool): Whether to also return the counts for each unique element. dim (int): the dimension to apply unique. If ``None``, the unique of the flattened input is returned. default: ``None`` Returns: (Tensor, Tensor (optional), Tensor (optional)): A tensor or a tuple of tensors containing - **output** (*Tensor*): the output list of unique scalar elements. - **inverse_indices** (*Tensor*): (optional) if :attr:`return_inverse` is True, there will be an additional returned tensor (same shape as input) representing the indices for where elements in the original input map to in the output; otherwise, this function will only return a single tensor. - **counts** (*Tensor*): (optional) if :attr:`return_counts` is True, there will be an additional returned tensor (same shape as output or output.size(dim), if dim was specified) representing the number of occurrences for each unique value or tensor. 
Example:: >>> output = torch.unique(torch.tensor([1, 3, 2, 3], dtype=torch.long)) >>> output tensor([ 2, 3, 1]) >>> output, inverse_indices = torch.unique( torch.tensor([1, 3, 2, 3], dtype=torch.long), sorted=True, return_inverse=True) >>> output tensor([ 1, 2, 3]) >>> inverse_indices tensor([ 0, 2, 1, 2]) >>> output, inverse_indices = torch.unique( torch.tensor([[1, 3], [2, 3]], dtype=torch.long), sorted=True, return_inverse=True) >>> output tensor([ 1, 2, 3]) >>> inverse_indices tensor([[ 0, 2], [ 1, 2]]) """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( unique, (input,), input, sorted=sorted, return_inverse=return_inverse, return_counts=return_counts, dim=dim) if dim is not None: output, inverse_indices, counts = _VF.unique_dim( input, dim, sorted=sorted, return_inverse=return_inverse, return_counts=return_counts, ) else: output, inverse_indices, counts = torch._unique2( input, sorted=sorted, return_inverse=return_inverse, return_counts=return_counts, ) return output, inverse_indices, counts def _unique_consecutive_impl(input, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor, Tensor] r"""Eliminates all but the first element from every consecutive group of equivalent elements. .. note:: This function is different from :func:`torch.unique` in the sense that this function only eliminates consecutive duplicate values. This semantics is similar to `std::unique` in C++. Arguments: input (Tensor): the input tensor return_inverse (bool): Whether to also return the indices for where elements in the original input ended up in the returned unique list. return_counts (bool): Whether to also return the counts for each unique element. dim (int): the dimension to apply unique. If ``None``, the unique of the flattened input is returned. default: ``None`` Returns: (Tensor, Tensor (optional), Tensor (optional)): A tensor or a tuple of tensors containing - **output** (*Tensor*): the output list of unique scalar elements. - **inverse_indices** (*Tensor*): (optional) if :attr:`return_inverse` is True, there will be an additional returned tensor (same shape as input) representing the indices for where elements in the original input map to in the output; otherwise, this function will only return a single tensor. - **counts** (*Tensor*): (optional) if :attr:`return_counts` is True, there will be an additional returned tensor (same shape as output or output.size(dim), if dim was specified) representing the number of occurrences for each unique value or tensor. 
Example:: >>> x = torch.tensor([1, 1, 2, 2, 3, 1, 1, 2]) >>> output = torch.unique_consecutive(x) >>> output tensor([1, 2, 3, 1, 2]) >>> output, inverse_indices = torch.unique_consecutive(x, return_inverse=True) >>> output tensor([1, 2, 3, 1, 2]) >>> inverse_indices tensor([0, 0, 1, 1, 2, 3, 3, 4]) >>> output, counts = torch.unique_consecutive(x, return_counts=True) >>> output tensor([1, 2, 3, 1, 2]) >>> counts tensor([2, 2, 1, 2, 1]) """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( unique_consecutive, (input,), input, return_inverse=return_inverse, return_counts=return_counts, dim=dim) output, inverse_indices, counts = _VF.unique_consecutive( input, return_inverse=return_inverse, return_counts=return_counts, dim=dim) return output, inverse_indices, counts def _return_counts(input, sorted=True, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor] if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_impl(input, sorted, return_inverse, return_counts, dim) output, _, counts = _unique_impl(input, sorted, return_inverse, return_counts, dim) return output, counts def _return_output(input, sorted=True, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, bool, Optional[int]) -> Tensor if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_impl(input, sorted, return_inverse, return_counts, dim) output, _, _ = _unique_impl(input, sorted, return_inverse, return_counts, dim) return output def _return_inverse(input, sorted=True, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor] if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_impl(input, sorted, return_inverse, return_counts, dim) output, inverse_indices, _ = _unique_impl(input, sorted, return_inverse, return_counts, dim) return output, inverse_indices _return_inverse_false = boolean_dispatch( arg_name='return_counts', arg_index=3, default=False, if_true=_return_counts, if_false=_return_output, module_name=__name__, func_name='unique') _return_inverse_true = boolean_dispatch( arg_name='return_counts', arg_index=3, default=False, if_true=_unique_impl, if_false=_return_inverse, module_name=__name__, func_name='unique') # The return type of unique depends on `return_inverse`, and `return_counts` so in order to # resolve the output type in TorchScript we need to statically know the value of both parameters unique = boolean_dispatch( arg_name='return_inverse', arg_index=2, default=False, if_true=_return_inverse_true, if_false=_return_inverse_false, module_name=__name__, func_name='unique') unique.__doc__ = _unique_impl.__doc__ def _consecutive_return_counts(input, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor] if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_consecutive_impl(input, return_inverse, return_counts, dim) output, _, counts = _unique_consecutive_impl(input, return_inverse, return_counts, dim) return output, counts def _consecutive_return_output(input, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, Optional[int]) -> Tensor if not 
torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_consecutive_impl(input, return_inverse, return_counts, dim) output, _, _ = _unique_consecutive_impl(input, return_inverse, return_counts, dim) return output def _consecutive_return_inverse(input, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor] if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_consecutive_impl(input, return_inverse, return_counts, dim) output, inverse_indices, _ = _unique_consecutive_impl(input, return_inverse, return_counts, dim) return output, inverse_indices _consecutive_return_inverse_false = boolean_dispatch( arg_name='return_counts', arg_index=1, default=False, if_true=_consecutive_return_counts, if_false=_consecutive_return_output, module_name=__name__, func_name='unique_consecutive') _consecutive_return_inverse_true = boolean_dispatch( arg_name='return_counts', arg_index=1, default=False, if_true=_unique_consecutive_impl, if_false=_consecutive_return_inverse, module_name=__name__, func_name='unique_consecutive') # The return type of unique depends on `return_inverse`, and `return_counts` so in order to # resolve the output type in TorchScript we need to statically know the value of both parameters unique_consecutive = boolean_dispatch( arg_name='return_inverse', arg_index=2, default=False, if_true=_consecutive_return_inverse_true, if_false=_consecutive_return_inverse_false, module_name=__name__, func_name='unique_consecutive') unique_consecutive.__doc__ = _unique_consecutive_impl.__doc__ def tensordot(a, b, dims=2): r"""Returns a contraction of a and b over multiple dimensions. :attr:`tensordot` implements a generalized matrix product. Args: a (Tensor): Left tensor to contract b (Tensor): Right tensor to contract dims (int or tuple of two lists of integers): number of dimensions to contract or explicit lists of dimensions for :attr:`a` and :attr:`b` respectively When called with a non-negative integer argument :attr:`dims` = :math:`d`, and the number of dimensions of :attr:`a` and :attr:`b` is :math:`m` and :math:`n`, respectively, :func:`~torch.tensordot` computes .. math:: r_{i_0,...,i_{m-d}, i_d,...,i_n} = \sum_{k_0,...,k_{d-1}} a_{i_0,...,i_{m-d},k_0,...,k_{d-1}} \times b_{k_0,...,k_{d-1}, i_d,...,i_n}. When called with :attr:`dims` of the list form, the given dimensions will be contracted in place of the last :math:`d` of :attr:`a` and the first :math:`d` of :math:`b`. The sizes in these dimensions must match, but :func:`~torch.tensordot` will deal with broadcasted dimensions. 
Examples:: >>> a = torch.arange(60.).reshape(3, 4, 5) >>> b = torch.arange(24.).reshape(4, 3, 2) >>> torch.tensordot(a, b, dims=([1, 0], [0, 1])) tensor([[4400., 4730.], [4532., 4874.], [4664., 5018.], [4796., 5162.], [4928., 5306.]]) >>> a = torch.randn(3, 4, 5, device='cuda') >>> b = torch.randn(4, 5, 6, device='cuda') >>> c = torch.tensordot(a, b, dims=2).cpu() tensor([[ 8.3504, -2.5436, 6.2922, 2.7556, -1.0732, 3.2741], [ 3.3161, 0.0704, 5.0187, -0.4079, -4.3126, 4.8744], [ 0.8223, 3.9445, 3.2168, -0.2400, 3.4117, 1.7780]]) """ if not torch.jit.is_scripting(): if (type(a) is not Tensor or type(b) is not Tensor) and has_torch_function((a, b)): return handle_torch_function(tensordot, (a, b), a, b, dims=dims) if isinstance(dims, (list, tuple)) or \ (isinstance(dims, torch.Tensor) and dims.numel() > 1): dims_a, dims_b = dims else: if isinstance(dims, torch.Tensor): dims = dims.item() if dims < 0: raise RuntimeError("tensordot expects dims >= 0, but got dims={}".format(dims)) dims_a = list(range(-dims, 0)) dims_b = list(range(dims)) return _VF.tensordot(a, b, dims_a, dims_b) def cartesian_prod(*tensors): """Do cartesian product of the given sequence of tensors. The behavior is similar to python's `itertools.product`. Arguments: *tensors: any number of 1 dimensional tensors. Returns: Tensor: A tensor equivalent to converting all the input tensors into lists, do `itertools.product` on these lists, and finally convert the resulting list into tensor. Example:: >>> a = [1, 2, 3] >>> b = [4, 5] >>> list(itertools.product(a, b)) [(1, 4), (1, 5), (2, 4), (2, 5), (3, 4), (3, 5)] >>> tensor_a = torch.tensor(a) >>> tensor_b = torch.tensor(b) >>> torch.cartesian_prod(tensor_a, tensor_b) tensor([[1, 4], [1, 5], [2, 4], [2, 5], [3, 4], [3, 5]]) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(cartesian_prod, tensors, *tensors) return _VF.cartesian_prod(tensors) def block_diag(*tensors): """Create a block diagonal matrix from provided tensors. Arguments: *tensors: One or more tensors with 0, 1, or 2 dimensions. Returns: Tensor: A 2 dimensional tensor with all the input tensors arranged in order such that their upper left and lower right corners are diagonally adjacent. All other elements are set to 0. Example:: >>> import torch >>> A = torch.tensor([[0, 1], [1, 0]]) >>> B = torch.tensor([[3, 4, 5], [6, 7, 8]]) >>> C = torch.tensor(7) >>> D = torch.tensor([1, 2, 3]) >>> E = torch.tensor([[4], [5], [6]]) >>> torch.block_diag(A, B, C, D, E) tensor([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 3, 4, 5, 0, 0, 0, 0, 0], [0, 0, 6, 7, 8, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 7, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 2, 3, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 4], [0, 0, 0, 0, 0, 0, 0, 0, 0, 5], [0, 0, 0, 0, 0, 0, 0, 0, 0, 6]]) """ if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(block_diag, tensors, *tensors) return torch._C._VariableFunctions.block_diag(tensors) def cdist(x1, x2, p=2., compute_mode='use_mm_for_euclid_dist_if_necessary'): # type: (Tensor, Tensor, float, str) -> (Tensor) r"""Computes batched the p-norm distance between each pair of the two collections of row vectors. Args: x1 (Tensor): input tensor of shape :math:`B \times P \times M`. x2 (Tensor): input tensor of shape :math:`B \times R \times M`. p: p value for the p-norm distance to calculate between each vector pair :math:`\in [0, \infty]`. 
compute_mode: 'use_mm_for_euclid_dist_if_necessary' - will use matrix multiplication approach to calculate euclidean distance (p = 2) if P > 25 or R > 25 'use_mm_for_euclid_dist' - will always use matrix multiplication approach to calculate euclidean distance (p = 2) 'donot_use_mm_for_euclid_dist' - will never use matrix multiplication approach to calculate euclidean distance (p = 2) Default: use_mm_for_euclid_dist_if_necessary. If x1 has shape :math:`B \times P \times M` and x2 has shape :math:`B \times R \times M` then the output will have shape :math:`B \times P \times R`. This function is equivalent to `scipy.spatial.distance.cdist(input,'minkowski', p=p)` if :math:`p \in (0, \infty)`. When :math:`p = 0` it is equivalent to `scipy.spatial.distance.cdist(input, 'hamming') * M`. When :math:`p = \infty`, the closest scipy function is `scipy.spatial.distance.cdist(xn, lambda x, y: np.abs(x - y).max())`. Example: >>> a = torch.tensor([[0.9041, 0.0196], [-0.3108, -2.4423], [-0.4821, 1.059]]) >>> a tensor([[ 0.9041, 0.0196], [-0.3108, -2.4423], [-0.4821, 1.0590]]) >>> b = torch.tensor([[-2.1763, -0.4713], [-0.6986, 1.3702]]) >>> b tensor([[-2.1763, -0.4713], [-0.6986, 1.3702]]) >>> torch.cdist(a, b, p=2) tensor([[3.1193, 2.0959], [2.7138, 3.8322], [2.2830, 0.3791]]) """ if not torch.jit.is_scripting(): if (type(x1) is not Tensor or type(x2) is not Tensor) and has_torch_function((x1, x2)): return handle_torch_function( cdist, (x1, x2), x1, x2, p=p, compute_mode=compute_mode) if compute_mode == 'use_mm_for_euclid_dist_if_necessary': return _VF.cdist(x1, x2, p, None) elif compute_mode == 'use_mm_for_euclid_dist': return _VF.cdist(x1, x2, p, 1) elif compute_mode == 'donot_use_mm_for_euclid_dist': return _VF.cdist(x1, x2, p, 2) else: raise ValueError("{} is not a valid value for compute_mode".format(compute_mode)) # TODO: type dim as BroadcastingList when https://github.com/pytorch/pytorch/issues/33782 is fixed @overload # noqa: 749 def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 # type: (Tensor, str, Optional[List[int]], bool, Optional[Tensor], Optional[int]) -> Tensor pass @overload # noqa: 749 def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 # type: (Tensor, Optional[number], Optional[List[int]], bool, Optional[Tensor], Optional[int]) -> Tensor pass @overload # noqa: 749 def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 # type: (Tensor, Optional[number], Optional[int], bool, Optional[Tensor], Optional[int]) -> Tensor pass @overload # noqa: 749 def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 # type: (Tensor, str, Optional[int], bool, Optional[Tensor], Optional[int]) -> Tensor pass def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 r"""Returns the matrix norm or vector norm of a given tensor. Args: input (Tensor): the input tensor p (int, float, inf, -inf, 'fro', 'nuc', optional): the order of norm. 
Default: ``'fro'`` The following norms can be calculated: ===== ============================ ========================== ord matrix norm vector norm ===== ============================ ========================== None Frobenius norm 2-norm 'fro' Frobenius norm -- 'nuc' nuclear norm -- Other as vec norm when dim is None sum(abs(x)**ord)**(1./ord) ===== ============================ ========================== dim (int, 2-tuple of ints, 2-list of ints, optional): If it is an int, vector norm will be calculated, if it is 2-tuple of ints, matrix norm will be calculated. If the value is None, matrix norm will be calculated when the input tensor only has two dimensions, vector norm will be calculated when the input tensor only has one dimension. If the input tensor has more than two dimensions, the vector norm will be applied to last dimension. keepdim (bool, optional): whether the output tensors have :attr:`dim` retained or not. Ignored if :attr:`dim` = ``None`` and :attr:`out` = ``None``. Default: ``False`` out (Tensor, optional): the output tensor. Ignored if :attr:`dim` = ``None`` and :attr:`out` = ``None``. dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. If specified, the input tensor is casted to :attr:'dtype' while performing the operation. Default: None. Example:: >>> import torch >>> a = torch.arange(9, dtype= torch.float) - 4 >>> b = a.reshape((3, 3)) >>> torch.norm(a) tensor(7.7460) >>> torch.norm(b) tensor(7.7460) >>> torch.norm(a, float('inf')) tensor(4.) >>> torch.norm(b, float('inf')) tensor(4.) >>> c = torch.tensor([[ 1, 2, 3],[-1, 1, 4]] , dtype= torch.float) >>> torch.norm(c, dim=0) tensor([1.4142, 2.2361, 5.0000]) >>> torch.norm(c, dim=1) tensor([3.7417, 4.2426]) >>> torch.norm(c, p=1, dim=1) tensor([6., 6.]) >>> d = torch.arange(8, dtype= torch.float).reshape(2,2,2) >>> torch.norm(d, dim=(1,2)) tensor([ 3.7417, 11.2250]) >>> torch.norm(d[0, :, :]), torch.norm(d[1, :, :]) (tensor(3.7417), tensor(11.2250)) """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( norm, (input,), input, p=p, dim=dim, keepdim=keepdim, out=out, dtype=dtype) ndim = input.dim() # catch default case if dim is None and out is None and dtype is None and p is not None: if isinstance(p, str): if p == "fro": return _VF.frobenius_norm(input) if not isinstance(p, str): return _VF.norm(input, p) # TODO: when https://github.com/pytorch/pytorch/issues/33782 is fixed # remove the overloads where dim is an int and replace with BraodcastingList1 # and remove next four lines, replace _dim with dim if dim is not None: if isinstance(dim, int): _dim = [dim] else: _dim = dim else: _dim = None if isinstance(p, str): if p == "fro": if dtype is not None: raise ValueError("dtype argument is not supported in frobenius norm") if _dim is None: _dim = [i for i in range(ndim)] # noqa: C416 TODO: rewrite as list(range(m)) if out is None: return _VF.frobenius_norm(input, _dim, keepdim=keepdim) else: return _VF.frobenius_norm(input, _dim, keepdim=keepdim, out=out) elif p == "nuc": if dtype is not None: raise ValueError("dtype argument is not supported in nuclear norm") if _dim is None: if out is None: return _VF.nuclear_norm(input, keepdim=keepdim) else: return _VF.nuclear_norm(input, keepdim=keepdim, out=out) else: if out is None: return _VF.nuclear_norm(input, _dim, keepdim=keepdim) else: return _VF.nuclear_norm(input, _dim, keepdim=keepdim, out=out) raise RuntimeError("only valid string values are 'fro' and 'nuc', found 
{}".format(p)) else: if _dim is None: _dim = [i for i in range(ndim)] # noqa: C416 TODO: rewrite as list(range(m)) if out is None: if dtype is None: return _VF.norm(input, p, _dim, keepdim=keepdim) else: return _VF.norm(input, p, _dim, keepdim=keepdim, dtype=dtype) else: if dtype is None: return _VF.norm(input, p, _dim, keepdim=keepdim, out=out) else: return _VF.norm(input, p, _dim, keepdim=keepdim, dtype=dtype, out=out) def chain_matmul(*matrices): r"""Returns the matrix product of the :math:`N` 2-D tensors. This product is efficiently computed using the matrix chain order algorithm which selects the order in which incurs the lowest cost in terms of arithmetic operations (`[CLRS]`_). Note that since this is a function to compute the product, :math:`N` needs to be greater than or equal to 2; if equal to 2 then a trivial matrix-matrix product is returned. If :math:`N` is 1, then this is a no-op - the original matrix is returned as is. Args: matrices (Tensors...): a sequence of 2 or more 2-D tensors whose product is to be determined. Returns: Tensor: if the :math:`i^{th}` tensor was of dimensions :math:`p_{i} \times p_{i + 1}`, then the product would be of dimensions :math:`p_{1} \times p_{N + 1}`. Example:: >>> a = torch.randn(3, 4) >>> b = torch.randn(4, 5) >>> c = torch.randn(5, 6) >>> d = torch.randn(6, 7) >>> torch.chain_matmul(a, b, c, d) tensor([[ -2.3375, -3.9790, -4.1119, -6.6577, 9.5609, -11.5095, -3.2614], [ 21.4038, 3.3378, -8.4982, -5.2457, -10.2561, -2.4684, 2.7163], [ -0.9647, -5.8917, -2.3213, -5.2284, 12.8615, -12.2816, -2.5095]]) .. _`[CLRS]`: https://mitpress.mit.edu/books/introduction-algorithms-third-edition """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in matrices) and has_torch_function(matrices): return handle_torch_function(chain_matmul, matrices, *matrices) return _VF.chain_matmul(matrices) def _lu_impl(A, pivot=True, get_infos=False, out=None): # type: (Tensor, bool, bool, Any) -> Tuple[Tensor, Tensor, Tensor] r"""Computes the LU factorization of a matrix or batches of matrices :attr:`A`. Returns a tuple containing the LU factorization and pivots of :attr:`A`. Pivoting is done if :attr:`pivot` is set to ``True``. .. note:: The pivots returned by the function are 1-indexed. If :attr:`pivot` is ``False``, then the returned pivots is a tensor filled with zeros of the appropriate size. .. note:: LU factorization with :attr:`pivot` = ``False`` is not available for CPU, and attempting to do so will throw an error. However, LU factorization with :attr:`pivot` = ``False`` is available for CUDA. .. note:: This function does not check if the factorization was successful or not if :attr:`get_infos` is ``True`` since the status of the factorization is present in the third element of the return tuple. .. note:: In the case of batches of square matrices with size less or equal to 32 on a CUDA device, the LU factorization is repeated for singular matrices due to the bug in the MAGMA library (see magma issue 13). .. note:: ``L``, ``U``, and ``P`` can be derived using :func:`torch.lu_unpack`. Arguments: A (Tensor): the tensor to factor of size :math:`(*, m, n)` pivot (bool, optional): controls whether pivoting is done. Default: ``True`` get_infos (bool, optional): if set to ``True``, returns an info IntTensor. Default: ``False`` out (tuple, optional): optional output tuple. If :attr:`get_infos` is ``True``, then the elements in the tuple are Tensor, IntTensor, and IntTensor. 
If :attr:`get_infos` is ``False``, then the elements in the tuple are Tensor, IntTensor. Default: ``None`` Returns: (Tensor, IntTensor, IntTensor (optional)): A tuple of tensors containing - **factorization** (*Tensor*): the factorization of size :math:`(*, m, n)` - **pivots** (*IntTensor*): the pivots of size :math:`(*, m)` - **infos** (*IntTensor*, *optional*): if :attr:`get_infos` is ``True``, this is a tensor of size :math:`(*)` where non-zero values indicate whether factorization for the matrix or each minibatch has succeeded or failed Example:: >>> A = torch.randn(2, 3, 3) >>> A_LU, pivots = torch.lu(A) >>> A_LU tensor([[[ 1.3506, 2.5558, -0.0816], [ 0.1684, 1.1551, 0.1940], [ 0.1193, 0.6189, -0.5497]], [[ 0.4526, 1.2526, -0.3285], [-0.7988, 0.7175, -0.9701], [ 0.2634, -0.9255, -0.3459]]]) >>> pivots tensor([[ 3, 3, 3], [ 3, 3, 3]], dtype=torch.int32) >>> A_LU, pivots, info = torch.lu(A, get_infos=True) >>> if info.nonzero().size(0) == 0: ... print('LU factorization succeeded for all samples!') LU factorization succeeded for all samples! """ # If get_infos is True, then we don't need to check for errors and vice versa return torch._lu_with_info(A, pivot=pivot, check_errors=(not get_infos)) def _check_list_size(out_len, get_infos, out): # type: (int, bool, List[Tensor]) -> None get_infos_int = 1 if get_infos else 0 if out_len - get_infos_int != 2: raise TypeError("expected tuple of {} elements but got {}" .format(2 + int(get_infos), len(out_len))) if not isinstance(out, (tuple, list)): raise TypeError("argument 'out' must be tuple of Tensors, not {}" .format(type(out).__name__)) def _lu_with_infos(A, pivot=True, get_infos=False, out=None): # type: (Tensor, bool, bool, Optional[Tuple[Tensor, Tensor, Tensor]]) -> Tuple[Tensor, Tensor, Tensor] if not torch.jit.is_scripting(): if type(A) is not Tensor and has_torch_function((A,)): return handle_torch_function( lu, (A,), A, pivot=pivot, get_infos=get_infos, out=out) result = _lu_impl(A, pivot, get_infos, out) if out is not None: _check_list_size(len(out), get_infos, out) for i in range(len(out)): out[i].resize_as_(result[i]).copy_(result[i]) return out else: return result # A_LU, pivots, infos def _lu_no_infos(A, pivot=True, get_infos=False, out=None): # type: (Tensor, bool, bool, Optional[Tuple[Tensor, Tensor]]) -> Tuple[Tensor, Tensor] # need to check for torch_function here so that we exit if if not torch.jit.is_scripting(): if type(A) is not Tensor and has_torch_function((A,)): return handle_torch_function( lu, (A,), A, pivot=pivot, get_infos=get_infos, out=out) result = _lu_impl(A, pivot, get_infos, out) if out is not None: _check_list_size(len(out), get_infos, out) for i in range(len(out)): out[i].resize_as_(result[i]).copy_(result[i]) return out else: return result[0], result[1] # A_LU, pivots # The return type of lu depends on `get_infos`, so in order to resolve the output type # of lu in TorchScript we need to statically know the value of `get_infos` lu = boolean_dispatch( arg_name='get_infos', arg_index=2, default=False, if_true=_lu_with_infos, if_false=_lu_no_infos, module_name=__name__, func_name='lu') lu.__doc__ = _lu_impl.__doc__ def align_tensors(*tensors): raise RuntimeError('`align_tensors` not yet implemented.') ================================================ FILE: patches/pytorch/1.6.0/functional.py ================================================ from typing import Tuple, Optional import librosa # STFT patch for aarch64 import numpy as np import torch import torch.nn.functional as F from ._lowrank import svd_lowrank, 
pca_lowrank from ._overrides import has_torch_function, handle_torch_function from ._jit_internal import boolean_dispatch, List from ._jit_internal import _overload as overload Tensor = torch.Tensor from torch import _VF __all__ = [ 'align_tensors', 'broadcast_tensors', 'cartesian_prod', 'block_diag', 'cdist', 'chain_matmul', 'einsum', 'istft', 'lu', 'lu_unpack', 'norm', 'meshgrid', 'pca_lowrank', 'split', 'stft', 'svd_lowrank', 'tensordot', 'unique', 'unique_consecutive', ] def broadcast_tensors(*tensors): r"""broadcast_tensors(*tensors) -> List of Tensors Broadcasts the given tensors according to :ref:`broadcasting-semantics`. Args: *tensors: any number of tensors of the same type .. warning:: More than one element of a broadcasted tensor may refer to a single memory location. As a result, in-place operations (especially ones that are vectorized) may result in incorrect behavior. If you need to write to the tensors, please clone them first. Example:: >>> x = torch.arange(3).view(1, 3) >>> y = torch.arange(2).view(2, 1) >>> a, b = torch.broadcast_tensors(x, y) >>> a.size() torch.Size([2, 3]) >>> a tensor([[0, 1, 2], [0, 1, 2]]) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(broadcast_tensors, tensors, *tensors) return _VF.broadcast_tensors(tensors) def split(tensor, split_size_or_sections, dim=0): r"""Splits the tensor into chunks. Each chunk is a view of the original tensor. If :attr:`split_size_or_sections` is an integer type, then :attr:`tensor` will be split into equally sized chunks (if possible). Last chunk will be smaller if the tensor size along the given dimension :attr:`dim` is not divisible by :attr:`split_size`. If :attr:`split_size_or_sections` is a list, then :attr:`tensor` will be split into ``len(split_size_or_sections)`` chunks with sizes in :attr:`dim` according to :attr:`split_size_or_sections`. Arguments: tensor (Tensor): tensor to split. split_size_or_sections (int) or (list(int)): size of a single chunk or list of sizes for each chunk dim (int): dimension along which to split the tensor. Example:: >>> a = torch.arange(10).reshape(5,2) >>> a tensor([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]]) >>> torch.split(a, 2) (tensor([[0, 1], [2, 3]]), tensor([[4, 5], [6, 7]]), tensor([[8, 9]])) >>> torch.split(a, [1,4]) (tensor([[0, 1]]), tensor([[2, 3], [4, 5], [6, 7], [8, 9]])) """ if not torch.jit.is_scripting(): if type(tensor) is not Tensor and has_torch_function((tensor,)): return handle_torch_function(split, (tensor,), tensor, split_size_or_sections, dim=dim) # Overwriting reason: # This dispatches to two ATen functions depending on the type of # split_size_or_sections. The branching code is in tensor.py, which we # call here. 
return tensor.split(split_size_or_sections, dim) # equivalent to itertools.product(indices) def _indices_product(indices): # type: (List[int]) -> (List[List[int]]) empty_list = torch.jit.annotate(List[int], []) result = [empty_list] for idx in indices: result_temp = torch.jit.annotate(List[List[int]], []) for res in result: for i in range(idx): result_temp.append(res + [i]) result = result_temp return result def _index_tensor_with_indices_list(tensor, indices): # type: (Tensor, List[int]) -> Tensor out = tensor for index in indices: out = out[index] return out def lu_unpack(LU_data, LU_pivots, unpack_data=True, unpack_pivots=True): # type: (Tensor, Tensor, bool, bool) -> (Tuple[Optional[Tensor], Optional[Tensor], Optional[Tensor]]) r"""Unpacks the data and pivots from a LU factorization of a tensor. Returns a tuple of tensors as ``(the pivots, the L tensor, the U tensor)``. Arguments: LU_data (Tensor): the packed LU factorization data LU_pivots (Tensor): the packed LU factorization pivots unpack_data (bool): flag indicating if the data should be unpacked unpack_pivots (bool): flag indicating if the pivots should be unpacked Examples:: >>> A = torch.randn(2, 3, 3) >>> A_LU, pivots = A.lu() >>> P, A_L, A_U = torch.lu_unpack(A_LU, pivots) >>> >>> # can recover A from factorization >>> A_ = torch.bmm(P, torch.bmm(A_L, A_U)) >>> # LU factorization of a rectangular matrix: >>> A = torch.randn(2, 3, 2) >>> A_LU, pivots = A.lu() >>> P, A_L, A_U = torch.lu_unpack(A_LU, pivots) >>> P tensor([[[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]], [[0., 0., 1.], [0., 1., 0.], [1., 0., 0.]]]) >>> A_L tensor([[[ 1.0000, 0.0000], [ 0.4763, 1.0000], [ 0.3683, 0.1135]], [[ 1.0000, 0.0000], [ 0.2957, 1.0000], [-0.9668, -0.3335]]]) >>> A_U tensor([[[ 2.1962, 1.0881], [ 0.0000, -0.8681]], [[-1.0947, 0.3736], [ 0.0000, 0.5718]]]) >>> A_ = torch.bmm(P, torch.bmm(A_L, A_U)) >>> torch.norm(A_ - A) tensor(2.9802e-08) """ if not torch.jit.is_scripting(): tens_ops = (LU_data, LU_pivots) if any([type(t) is not Tensor for t in tens_ops]) and has_torch_function(tens_ops): return handle_torch_function( lu_unpack, tens_ops, LU_data, LU_pivots, unpack_data=unpack_data, unpack_pivots=unpack_pivots) shape = LU_data.shape # In generalized LU factorization, the following shape relations hold: # A.shape[-2:] == (m, n) # P.shape[-2:] == (m, m) # L.shape[-2:] == (m, k) # U.shape[-2:] == (k, n) # where k = min(m, n) m, n = shape[-2:] k = min(m, n) if unpack_data: U = LU_data.triu() if m != k: U = U.narrow(-2, 0, k) L = LU_data.tril() if k != n: L = L.narrow(-1, 0, k) L.diagonal(dim1=-2, dim2=-1).fill_(1) else: L = U = None if unpack_pivots: LU_pivots_zero_idx = LU_pivots - 1 if LU_data.dim() > 2: P = torch.eye(m, device=LU_data.device, dtype=LU_data.dtype) \ .expand(shape[:-1] + (m,)) \ .clone(memory_format=torch.contiguous_format) # TODO: rewrite when TorchScript supports product and map as # product(*map(lambda x: list(range(x)), shape[:-2])) when issue 33781 is fixed indices = _indices_product(shape[:-2]) for idx in indices: final_order = [i for i in range(m)] # noqa: C416 TODO: rewrite as list(range(m)) for k, j in enumerate(_index_tensor_with_indices_list(LU_pivots_zero_idx, idx)): final_order[k], final_order[j] = final_order[j], final_order[k] # TODO: remove _index_tensor_with_indices_list when TorchScript supports indexing Tensor with list p_idx = _index_tensor_with_indices_list(P, idx) p_idx.copy_(p_idx.index_select(1, torch.as_tensor(final_order, device=LU_pivots.device))) else: P = torch.eye(m, device=LU_data.device, 
dtype=LU_data.dtype) final_order = [i for i in range(m)] # noqa: C416 TODO: rewrite as list(range(m)) for k, j, in enumerate(LU_pivots_zero_idx): final_order[k], final_order[j] = final_order[j], final_order[k] P = P.index_select(1, torch.as_tensor(final_order, device=LU_pivots.device)) else: P = None return P, L, U def einsum(equation, *operands): r"""einsum(equation, *operands) -> Tensor This function provides a way of computing multilinear expressions (i.e. sums of products) using the Einstein summation convention. Args: equation (string): The equation is given in terms of lower case letters (indices) to be associated with each dimension of the operands and result. The left hand side lists the operands dimensions, separated by commas. There should be one index letter per tensor dimension. The right hand side follows after `->` and gives the indices for the output. If the `->` and right hand side are omitted, it implicitly defined as the alphabetically sorted list of all indices appearing exactly once in the left hand side. The indices not apprearing in the output are summed over after multiplying the operands entries. If an index appears several times for the same operand, a diagonal is taken. Ellipses `...` represent a fixed number of dimensions. If the right hand side is inferred, the ellipsis dimensions are at the beginning of the output. operands (Tensor): The operands to compute the Einstein sum of. .. note:: This function does not optimize the given expression, so a different formula for the same computation may run faster or consume less memory. Projects like opt_einsum (https://optimized-einsum.readthedocs.io/en/stable/) can optimize the formula for you. Examples:: >>> x = torch.randn(5) >>> y = torch.randn(4) >>> torch.einsum('i,j->ij', x, y) # outer product tensor([[-0.0570, -0.0286, -0.0231, 0.0197], [ 1.2616, 0.6335, 0.5113, -0.4351], [ 1.4452, 0.7257, 0.5857, -0.4984], [-0.4647, -0.2333, -0.1883, 0.1603], [-1.1130, -0.5588, -0.4510, 0.3838]]) >>> A = torch.randn(3,5,4) >>> l = torch.randn(2,5) >>> r = torch.randn(2,4) >>> torch.einsum('bn,anm,bm->ba', l, A, r) # compare torch.nn.functional.bilinear tensor([[-0.3430, -5.2405, 0.4494], [ 0.3311, 5.5201, -3.0356]]) >>> As = torch.randn(3,2,5) >>> Bs = torch.randn(3,5,4) >>> torch.einsum('bij,bjk->bik', As, Bs) # batch matrix multiplication tensor([[[-1.0564, -1.5904, 3.2023, 3.1271], [-1.6706, -0.8097, -0.8025, -2.1183]], [[ 4.2239, 0.3107, -0.5756, -0.2354], [-1.4558, -0.3460, 1.5087, -0.8530]], [[ 2.8153, 1.8787, -4.3839, -1.2112], [ 0.3728, -2.1131, 0.0921, 0.8305]]]) >>> A = torch.randn(3, 3) >>> torch.einsum('ii->i', A) # diagonal tensor([-0.7825, 0.8291, -0.1936]) >>> A = torch.randn(4, 3, 3) >>> torch.einsum('...ii->...i', A) # batch diagonal tensor([[-1.0864, 0.7292, 0.0569], [-0.9725, -1.0270, 0.6493], [ 0.5832, -1.1716, -1.5084], [ 0.4041, -1.1690, 0.8570]]) >>> A = torch.randn(2, 3, 4, 5) >>> torch.einsum('...ij->...ji', A).shape # batch permute torch.Size([2, 3, 5, 4]) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in operands) and has_torch_function(operands): return handle_torch_function(einsum, operands, equation, *operands) if len(operands) == 1 and isinstance(operands[0], (list, tuple)): # the old interface of passing the operands as one list argument operands = operands[0] # recurse incase operands contains value that has torch function # in the original implementation this line is omitted return einsum(equation, *operands) return _VF.einsum(equation, operands) def meshgrid(*tensors): 
r"""Take :math:`N` tensors, each of which can be either scalar or 1-dimensional vector, and create :math:`N` N-dimensional grids, where the :math:`i` :sup:`th` grid is defined by expanding the :math:`i` :sup:`th` input over dimensions defined by other inputs. Args: tensors (list of Tensor): list of scalars or 1 dimensional tensors. Scalars will be treated as tensors of size :math:`(1,)` automatically Returns: seq (sequence of Tensors): If the input has :math:`k` tensors of size :math:`(N_1,), (N_2,), \ldots , (N_k,)`, then the output would also have :math:`k` tensors, where all tensors are of size :math:`(N_1, N_2, \ldots , N_k)`. Example:: >>> x = torch.tensor([1, 2, 3]) >>> y = torch.tensor([4, 5, 6]) >>> grid_x, grid_y = torch.meshgrid(x, y) >>> grid_x tensor([[1, 1, 1], [2, 2, 2], [3, 3, 3]]) >>> grid_y tensor([[4, 5, 6], [4, 5, 6], [4, 5, 6]]) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(meshgrid, tensors, *tensors) if len(tensors) == 1 and isinstance(tensors[0], (list, tuple)): # the old interface of passing the operands as one list argument tensors = tensors[0] return _VF.meshgrid(tensors) def stft(input, n_fft, hop_length=None, win_length=None, window=None, center=True, pad_mode='reflect', normalized=False, onesided=True): # type: (Tensor, int, Optional[int], Optional[int], Optional[Tensor], bool, str, bool, bool) -> Tensor r"""Short-time Fourier transform (STFT). Ignoring the optional batch dimension, this method computes the following expression: .. math:: X[m, \omega] = \sum_{k = 0}^{\text{win\_length-1}}% \text{window}[k]\ \text{input}[m \times \text{hop\_length} + k]\ % \exp\left(- j \frac{2 \pi \cdot \omega k}{\text{win\_length}}\right), where :math:`m` is the index of the sliding window, and :math:`\omega` is the frequency that :math:`0 \leq \omega < \text{n\_fft}`. When :attr:`onesided` is the default value ``True``, * :attr:`input` must be either a 1-D time sequence or a 2-D batch of time sequences. * If :attr:`hop_length` is ``None`` (default), it is treated as equal to ``floor(n_fft / 4)``. * If :attr:`win_length` is ``None`` (default), it is treated as equal to :attr:`n_fft`. * :attr:`window` can be a 1-D tensor of size :attr:`win_length`, e.g., from :meth:`torch.hann_window`. If :attr:`window` is ``None`` (default), it is treated as if having :math:`1` everywhere in the window. If :math:`\text{win\_length} < \text{n\_fft}`, :attr:`window` will be padded on both sides to length :attr:`n_fft` before being applied. * If :attr:`center` is ``True`` (default), :attr:`input` will be padded on both sides so that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. Otherwise, the :math:`t`-th frame begins at time :math:`t \times \text{hop\_length}`. * :attr:`pad_mode` determines the padding method used on :attr:`input` when :attr:`center` is ``True``. See :meth:`torch.nn.functional.pad` for all available options. Default is ``"reflect"``. * If :attr:`onesided` is ``True`` (default), only values for :math:`\omega` in :math:`\left[0, 1, 2, \dots, \left\lfloor \frac{\text{n\_fft}}{2} \right\rfloor + 1\right]` are returned because the real-to-complex Fourier transform satisfies the conjugate symmetry, i.e., :math:`X[m, \omega] = X[m, \text{n\_fft} - \omega]^*`. * If :attr:`normalized` is ``True`` (default is ``False``), the function returns the normalized STFT results, i.e., multiplied by :math:`(\text{frame\_length})^{-0.5}`. 
Returns the real and the imaginary parts together as one tensor of size :math:`(* \times N \times T \times 2)`, where :math:`*` is the optional batch size of :attr:`input`, :math:`N` is the number of frequencies where STFT is applied, :math:`T` is the total number of frames used, and each pair in the last dimension represents a complex number as the real part and the imaginary part. .. warning:: This function changed signature at version 0.4.1. Calling with the previous signature may cause an error or return an incorrect result. Arguments: input (Tensor): the input tensor n_fft (int): size of Fourier transform hop_length (int, optional): the distance between neighboring sliding window frames. Default: ``None`` (treated as equal to ``floor(n_fft / 4)``) win_length (int, optional): the size of window frame and STFT filter. Default: ``None`` (treated as equal to :attr:`n_fft`) window (Tensor, optional): the optional window function. Default: ``None`` (treated as window of all :math:`1` s) center (bool, optional): whether to pad :attr:`input` on both sides so that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. Default: ``True`` pad_mode (string, optional): controls the padding method used when :attr:`center` is ``True``. Default: ``"reflect"`` normalized (bool, optional): controls whether to return the normalized STFT results Default: ``False`` onesided (bool, optional): controls whether to return half of results to avoid redundancy Default: ``True`` Returns: Tensor: A tensor containing the STFT result with shape described above """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( stft, (input,), input, n_fft, hop_length=hop_length, win_length=win_length, window=window, center=center, pad_mode=pad_mode, normalized=normalized, onesided=onesided) # TODO: after having proper ways to map Python strings to ATen Enum, move # this and F.pad to ATen. if center: signal_dim = input.dim() extended_shape = [1] * (3 - signal_dim) + list(input.size()) pad = int(n_fft // 2) input = F.pad(input.view(extended_shape), (pad, pad), pad_mode) input = input.view(input.shape[-signal_dim:]) # STFT patch for aarch64 # https://stackoverflow.com/a/66872148 librosa_stft = librosa.stft(input.cpu().detach().numpy().reshape(-1), n_fft, hop_length, win_length, window="hann", center=center, pad_mode=pad_mode) librosa_stft = np.array([[a.real, a.imag] for a in librosa_stft]) librosa_stft = np.transpose(librosa_stft, axes=[0, 2, 1]) librosa_stft = np.expand_dims(librosa_stft, 0) librosa_stft = torch.from_numpy(librosa_stft) return librosa_stft #return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore # normalized, onesided, return_complex) def istft(input, n_fft, hop_length=None, win_length=None, window=None, center=True, normalized=False, onesided=True, length=None): # type: (Tensor, int, Optional[int], Optional[int], Optional[Tensor], bool, bool, bool, Optional[int]) -> Tensor r"""Inverse short time Fourier Transform. This is expected to be the inverse of :func:`~torch.stft`. It has the same parameters (+ additional optional parameter of :attr:`length`) and it should return the least squares estimation of the original signal. The algorithm will check, using the NOLA condition (nonzero overlap), that the envelope created by the summation of all the windows is never zero at any point in time; the parameters :attr:`window` and :attr:`center` must be chosen so that this holds.
Specifically, :math:`\sum_{t=-\infty}^{\infty} w^2[n-t\times hop\_length] \cancel{=} 0`. Since :func:`~torch.stft` discards elements at the end of the signal if they do not fit in a frame, ``istft`` may return a shorter signal than the original signal (can occur if :attr:`center` is False since the signal isn't padded). If :attr:`center` is ``True``, then there will be padding e.g. ``'constant'``, ``'reflect'``, etc. Left padding can be trimmed off exactly because they can be calculated but right padding cannot be calculated without additional information. Example: Suppose the last window is: ``[17, 18, 0, 0, 0]`` vs ``[18, 0, 0, 0, 0]`` The :attr:`n_fft`, :attr:`hop_length`, :attr:`win_length` are all the same which prevents the calculation of right padding. These additional values could be zeros or a reflection of the signal so providing :attr:`length` could be useful. If :attr:`length` is ``None`` then padding will be aggressively removed (some loss of signal). [1] D. W. Griffin and J. S. Lim, "Signal estimation from modified short-time Fourier transform," IEEE Trans. ASSP, vol.32, no.2, pp.236-243, Apr. 1984. Arguments: input (Tensor): The input tensor. Expected to be output of :func:`~torch.stft`, either 3D (``fft_size``, ``n_frame``, 2) or 4D (``channel``, ``fft_size``, ``n_frame``, 2). n_fft (int): Size of Fourier transform hop_length (Optional[int]): The distance between neighboring sliding window frames. (Default: ``n_fft // 4``) win_length (Optional[int]): The size of window frame and STFT filter. (Default: ``n_fft``) window (Optional[torch.Tensor]): The optional window function. (Default: ``torch.ones(win_length)``) center (bool): Whether :attr:`input` was padded on both sides so that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. (Default: ``True``) normalized (bool): Whether the STFT was normalized. (Default: ``False``) onesided (bool): Whether the STFT is onesided. (Default: ``True``) length (Optional[int]): The amount to trim the signal by (i.e. the original signal length). (Default: whole signal) Returns: Tensor: Least squares estimation of the original signal of size (..., signal_length) """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( istft, (input,), input, n_fft, hop_length=hop_length, win_length=win_length, window=window, center=center, normalized=normalized, onesided=onesided, length=length) return _VF.istft( input, n_fft, hop_length, win_length, window, center, normalized, onesided, length) del torch.unique_dim def _unique_impl(input, sorted=True, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor, Tensor] r"""Returns the unique elements of the input tensor. .. note:: This function is different from :func:`torch.unique_consecutive` in the sense that this function also eliminates non-consecutive duplicate values. .. note:: Currently in the CUDA implementation and the CPU implementation when dim is specified, `torch.unique` always sort the tensor at the beginning regardless of the `sort` argument. Sorting could be slow, so if your input tensor is already sorted, it is recommended to use :func:`torch.unique_consecutive` which avoids the sorting. Arguments: input (Tensor): the input tensor sorted (bool): Whether to sort the unique elements in ascending order before returning as output. 
return_inverse (bool): Whether to also return the indices for where elements in the original input ended up in the returned unique list. return_counts (bool): Whether to also return the counts for each unique element. dim (int): the dimension to apply unique. If ``None``, the unique of the flattened input is returned. default: ``None`` Returns: (Tensor, Tensor (optional), Tensor (optional)): A tensor or a tuple of tensors containing - **output** (*Tensor*): the output list of unique scalar elements. - **inverse_indices** (*Tensor*): (optional) if :attr:`return_inverse` is True, there will be an additional returned tensor (same shape as input) representing the indices for where elements in the original input map to in the output; otherwise, this function will only return a single tensor. - **counts** (*Tensor*): (optional) if :attr:`return_counts` is True, there will be an additional returned tensor (same shape as output or output.size(dim), if dim was specified) representing the number of occurrences for each unique value or tensor. Example:: >>> output = torch.unique(torch.tensor([1, 3, 2, 3], dtype=torch.long)) >>> output tensor([ 2, 3, 1]) >>> output, inverse_indices = torch.unique( torch.tensor([1, 3, 2, 3], dtype=torch.long), sorted=True, return_inverse=True) >>> output tensor([ 1, 2, 3]) >>> inverse_indices tensor([ 0, 2, 1, 2]) >>> output, inverse_indices = torch.unique( torch.tensor([[1, 3], [2, 3]], dtype=torch.long), sorted=True, return_inverse=True) >>> output tensor([ 1, 2, 3]) >>> inverse_indices tensor([[ 0, 2], [ 1, 2]]) """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( unique, (input,), input, sorted=sorted, return_inverse=return_inverse, return_counts=return_counts, dim=dim) if dim is not None: output, inverse_indices, counts = _VF.unique_dim( input, dim, sorted=sorted, return_inverse=return_inverse, return_counts=return_counts, ) else: output, inverse_indices, counts = torch._unique2( input, sorted=sorted, return_inverse=return_inverse, return_counts=return_counts, ) return output, inverse_indices, counts def _unique_consecutive_impl(input, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor, Tensor] r"""Eliminates all but the first element from every consecutive group of equivalent elements. .. note:: This function is different from :func:`torch.unique` in the sense that this function only eliminates consecutive duplicate values. This semantics is similar to `std::unique` in C++. Arguments: input (Tensor): the input tensor return_inverse (bool): Whether to also return the indices for where elements in the original input ended up in the returned unique list. return_counts (bool): Whether to also return the counts for each unique element. dim (int): the dimension to apply unique. If ``None``, the unique of the flattened input is returned. default: ``None`` Returns: (Tensor, Tensor (optional), Tensor (optional)): A tensor or a tuple of tensors containing - **output** (*Tensor*): the output list of unique scalar elements. - **inverse_indices** (*Tensor*): (optional) if :attr:`return_inverse` is True, there will be an additional returned tensor (same shape as input) representing the indices for where elements in the original input map to in the output; otherwise, this function will only return a single tensor. 
- **counts** (*Tensor*): (optional) if :attr:`return_counts` is True, there will be an additional returned tensor (same shape as output or output.size(dim), if dim was specified) representing the number of occurrences for each unique value or tensor. Example:: >>> x = torch.tensor([1, 1, 2, 2, 3, 1, 1, 2]) >>> output = torch.unique_consecutive(x) >>> output tensor([1, 2, 3, 1, 2]) >>> output, inverse_indices = torch.unique_consecutive(x, return_inverse=True) >>> output tensor([1, 2, 3, 1, 2]) >>> inverse_indices tensor([0, 0, 1, 1, 2, 3, 3, 4]) >>> output, counts = torch.unique_consecutive(x, return_counts=True) >>> output tensor([1, 2, 3, 1, 2]) >>> counts tensor([2, 2, 1, 2, 1]) """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( unique_consecutive, (input,), input, return_inverse=return_inverse, return_counts=return_counts, dim=dim) output, inverse_indices, counts = _VF.unique_consecutive( input, return_inverse=return_inverse, return_counts=return_counts, dim=dim) return output, inverse_indices, counts def _return_counts(input, sorted=True, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor] if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_impl(input, sorted, return_inverse, return_counts, dim) output, _, counts = _unique_impl(input, sorted, return_inverse, return_counts, dim) return output, counts def _return_output(input, sorted=True, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, bool, Optional[int]) -> Tensor if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_impl(input, sorted, return_inverse, return_counts, dim) output, _, _ = _unique_impl(input, sorted, return_inverse, return_counts, dim) return output def _return_inverse(input, sorted=True, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor] if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_impl(input, sorted, return_inverse, return_counts, dim) output, inverse_indices, _ = _unique_impl(input, sorted, return_inverse, return_counts, dim) return output, inverse_indices _return_inverse_false = boolean_dispatch( arg_name='return_counts', arg_index=3, default=False, if_true=_return_counts, if_false=_return_output, module_name=__name__, func_name='unique') _return_inverse_true = boolean_dispatch( arg_name='return_counts', arg_index=3, default=False, if_true=_unique_impl, if_false=_return_inverse, module_name=__name__, func_name='unique') # The return type of unique depends on `return_inverse`, and `return_counts` so in order to # resolve the output type in TorchScript we need to statically know the value of both parameters unique = boolean_dispatch( arg_name='return_inverse', arg_index=2, default=False, if_true=_return_inverse_true, if_false=_return_inverse_false, module_name=__name__, func_name='unique') unique.__doc__ = _unique_impl.__doc__ def _consecutive_return_counts(input, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor] if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_consecutive_impl(input, return_inverse, return_counts, dim) output, _, counts = 
_unique_consecutive_impl(input, return_inverse, return_counts, dim) return output, counts def _consecutive_return_output(input, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, Optional[int]) -> Tensor if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_consecutive_impl(input, return_inverse, return_counts, dim) output, _, _ = _unique_consecutive_impl(input, return_inverse, return_counts, dim) return output def _consecutive_return_inverse(input, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor] if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_consecutive_impl(input, return_inverse, return_counts, dim) output, inverse_indices, _ = _unique_consecutive_impl(input, return_inverse, return_counts, dim) return output, inverse_indices _consecutive_return_inverse_false = boolean_dispatch( arg_name='return_counts', arg_index=1, default=False, if_true=_consecutive_return_counts, if_false=_consecutive_return_output, module_name=__name__, func_name='unique_consecutive') _consecutive_return_inverse_true = boolean_dispatch( arg_name='return_counts', arg_index=1, default=False, if_true=_unique_consecutive_impl, if_false=_consecutive_return_inverse, module_name=__name__, func_name='unique_consecutive') # The return type of unique depends on `return_inverse`, and `return_counts` so in order to # resolve the output type in TorchScript we need to statically know the value of both parameters unique_consecutive = boolean_dispatch( arg_name='return_inverse', arg_index=2, default=False, if_true=_consecutive_return_inverse_true, if_false=_consecutive_return_inverse_false, module_name=__name__, func_name='unique_consecutive') unique_consecutive.__doc__ = _unique_consecutive_impl.__doc__ def tensordot(a, b, dims=2): r"""Returns a contraction of a and b over multiple dimensions. :attr:`tensordot` implements a generalized matrix product. Args: a (Tensor): Left tensor to contract b (Tensor): Right tensor to contract dims (int or tuple of two lists of integers): number of dimensions to contract or explicit lists of dimensions for :attr:`a` and :attr:`b` respectively When called with a non-negative integer argument :attr:`dims` = :math:`d`, and the number of dimensions of :attr:`a` and :attr:`b` is :math:`m` and :math:`n`, respectively, :func:`~torch.tensordot` computes .. math:: r_{i_0,...,i_{m-d}, i_d,...,i_n} = \sum_{k_0,...,k_{d-1}} a_{i_0,...,i_{m-d},k_0,...,k_{d-1}} \times b_{k_0,...,k_{d-1}, i_d,...,i_n}. When called with :attr:`dims` of the list form, the given dimensions will be contracted in place of the last :math:`d` of :attr:`a` and the first :math:`d` of :math:`b`. The sizes in these dimensions must match, but :func:`~torch.tensordot` will deal with broadcasted dimensions. 
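For instance, in the first example below, ``dims=([1, 0], [0, 1])`` contracts dimension 1 of :attr:`a` (size 4) with dimension 0 of :attr:`b` (size 4), and dimension 0 of :attr:`a` (size 3) with dimension 1 of :attr:`b` (size 3), so the remaining dimensions give a result of size :math:`5 \times 2`.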
Examples:: >>> a = torch.arange(60.).reshape(3, 4, 5) >>> b = torch.arange(24.).reshape(4, 3, 2) >>> torch.tensordot(a, b, dims=([1, 0], [0, 1])) tensor([[4400., 4730.], [4532., 4874.], [4664., 5018.], [4796., 5162.], [4928., 5306.]]) >>> a = torch.randn(3, 4, 5, device='cuda') >>> b = torch.randn(4, 5, 6, device='cuda') >>> c = torch.tensordot(a, b, dims=2).cpu() tensor([[ 8.3504, -2.5436, 6.2922, 2.7556, -1.0732, 3.2741], [ 3.3161, 0.0704, 5.0187, -0.4079, -4.3126, 4.8744], [ 0.8223, 3.9445, 3.2168, -0.2400, 3.4117, 1.7780]]) """ if not torch.jit.is_scripting(): if (type(a) is not Tensor or type(b) is not Tensor) and has_torch_function((a, b)): return handle_torch_function(tensordot, (a, b), a, b, dims=dims) if isinstance(dims, (list, tuple)) or \ (isinstance(dims, torch.Tensor) and dims.numel() > 1): dims_a, dims_b = dims else: if isinstance(dims, torch.Tensor): dims = dims.item() if dims < 0: raise RuntimeError("tensordot expects dims >= 0, but got dims={}".format(dims)) dims_a = list(range(-dims, 0)) dims_b = list(range(dims)) return _VF.tensordot(a, b, dims_a, dims_b) def cartesian_prod(*tensors): """Do cartesian product of the given sequence of tensors. The behavior is similar to python's `itertools.product`. Arguments: *tensors: any number of 1 dimensional tensors. Returns: Tensor: A tensor equivalent to converting all the input tensors into lists, do `itertools.product` on these lists, and finally convert the resulting list into tensor. Example:: >>> a = [1, 2, 3] >>> b = [4, 5] >>> list(itertools.product(a, b)) [(1, 4), (1, 5), (2, 4), (2, 5), (3, 4), (3, 5)] >>> tensor_a = torch.tensor(a) >>> tensor_b = torch.tensor(b) >>> torch.cartesian_prod(tensor_a, tensor_b) tensor([[1, 4], [1, 5], [2, 4], [2, 5], [3, 4], [3, 5]]) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(cartesian_prod, tensors, *tensors) return _VF.cartesian_prod(tensors) def block_diag(*tensors): """Create a block diagonal matrix from provided tensors. Arguments: *tensors: One or more tensors with 0, 1, or 2 dimensions. Returns: Tensor: A 2 dimensional tensor with all the input tensors arranged in order such that their upper left and lower right corners are diagonally adjacent. All other elements are set to 0. Example:: >>> import torch >>> A = torch.tensor([[0, 1], [1, 0]]) >>> B = torch.tensor([[3, 4, 5], [6, 7, 8]]) >>> C = torch.tensor(7) >>> D = torch.tensor([1, 2, 3]) >>> E = torch.tensor([[4], [5], [6]]) >>> torch.block_diag(A, B, C, D, E) tensor([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 3, 4, 5, 0, 0, 0, 0, 0], [0, 0, 6, 7, 8, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 7, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 2, 3, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 4], [0, 0, 0, 0, 0, 0, 0, 0, 0, 5], [0, 0, 0, 0, 0, 0, 0, 0, 0, 6]]) """ if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(block_diag, tensors, *tensors) return torch._C._VariableFunctions.block_diag(tensors) def cdist(x1, x2, p=2., compute_mode='use_mm_for_euclid_dist_if_necessary'): # type: (Tensor, Tensor, float, str) -> (Tensor) r"""Computes batched the p-norm distance between each pair of the two collections of row vectors. Args: x1 (Tensor): input tensor of shape :math:`B \times P \times M`. x2 (Tensor): input tensor of shape :math:`B \times R \times M`. p: p value for the p-norm distance to calculate between each vector pair :math:`\in [0, \infty]`. 
compute_mode: 'use_mm_for_euclid_dist_if_necessary' - will use matrix multiplication approach to calculate euclidean distance (p = 2) if P > 25 or R > 25 'use_mm_for_euclid_dist' - will always use matrix multiplication approach to calculate euclidean distance (p = 2) 'donot_use_mm_for_euclid_dist' - will never use matrix multiplication approach to calculate euclidean distance (p = 2) Default: use_mm_for_euclid_dist_if_necessary. If x1 has shape :math:`B \times P \times M` and x2 has shape :math:`B \times R \times M` then the output will have shape :math:`B \times P \times R`. This function is equivalent to `scipy.spatial.distance.cdist(input,'minkowski', p=p)` if :math:`p \in (0, \infty)`. When :math:`p = 0` it is equivalent to `scipy.spatial.distance.cdist(input, 'hamming') * M`. When :math:`p = \infty`, the closest scipy function is `scipy.spatial.distance.cdist(xn, lambda x, y: np.abs(x - y).max())`. Example: >>> a = torch.tensor([[0.9041, 0.0196], [-0.3108, -2.4423], [-0.4821, 1.059]]) >>> a tensor([[ 0.9041, 0.0196], [-0.3108, -2.4423], [-0.4821, 1.0590]]) >>> b = torch.tensor([[-2.1763, -0.4713], [-0.6986, 1.3702]]) >>> b tensor([[-2.1763, -0.4713], [-0.6986, 1.3702]]) >>> torch.cdist(a, b, p=2) tensor([[3.1193, 2.0959], [2.7138, 3.8322], [2.2830, 0.3791]]) """ if not torch.jit.is_scripting(): if (type(x1) is not Tensor or type(x2) is not Tensor) and has_torch_function((x1, x2)): return handle_torch_function( cdist, (x1, x2), x1, x2, p=p, compute_mode=compute_mode) if compute_mode == 'use_mm_for_euclid_dist_if_necessary': return _VF.cdist(x1, x2, p, None) elif compute_mode == 'use_mm_for_euclid_dist': return _VF.cdist(x1, x2, p, 1) elif compute_mode == 'donot_use_mm_for_euclid_dist': return _VF.cdist(x1, x2, p, 2) else: raise ValueError("{} is not a valid value for compute_mode".format(compute_mode)) # TODO: type dim as BroadcastingList when https://github.com/pytorch/pytorch/issues/33782 is fixed @overload # noqa: 749 def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 # type: (Tensor, str, Optional[List[int]], bool, Optional[Tensor], Optional[int]) -> Tensor pass @overload # noqa: 749 def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 # type: (Tensor, Optional[number], Optional[List[int]], bool, Optional[Tensor], Optional[int]) -> Tensor pass @overload # noqa: 749 def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 # type: (Tensor, Optional[number], Optional[int], bool, Optional[Tensor], Optional[int]) -> Tensor pass @overload # noqa: 749 def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 # type: (Tensor, str, Optional[int], bool, Optional[Tensor], Optional[int]) -> Tensor pass def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 r"""Returns the matrix norm or vector norm of a given tensor. Args: input (Tensor): the input tensor p (int, float, inf, -inf, 'fro', 'nuc', optional): the order of norm. 
Default: ``'fro'`` The following norms can be calculated: ===== ============================ ========================== ord matrix norm vector norm ===== ============================ ========================== None Frobenius norm 2-norm 'fro' Frobenius norm -- 'nuc' nuclear norm -- Other as vec norm when dim is None sum(abs(x)**ord)**(1./ord) ===== ============================ ========================== dim (int, 2-tuple of ints, 2-list of ints, optional): If it is an int, vector norm will be calculated, if it is 2-tuple of ints, matrix norm will be calculated. If the value is None, matrix norm will be calculated when the input tensor only has two dimensions, vector norm will be calculated when the input tensor only has one dimension. If the input tensor has more than two dimensions, the vector norm will be applied to last dimension. keepdim (bool, optional): whether the output tensors have :attr:`dim` retained or not. Ignored if :attr:`dim` = ``None`` and :attr:`out` = ``None``. Default: ``False`` out (Tensor, optional): the output tensor. Ignored if :attr:`dim` = ``None`` and :attr:`out` = ``None``. dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. If specified, the input tensor is casted to :attr:'dtype' while performing the operation. Default: None. Example:: >>> import torch >>> a = torch.arange(9, dtype= torch.float) - 4 >>> b = a.reshape((3, 3)) >>> torch.norm(a) tensor(7.7460) >>> torch.norm(b) tensor(7.7460) >>> torch.norm(a, float('inf')) tensor(4.) >>> torch.norm(b, float('inf')) tensor(4.) >>> c = torch.tensor([[ 1, 2, 3],[-1, 1, 4]] , dtype= torch.float) >>> torch.norm(c, dim=0) tensor([1.4142, 2.2361, 5.0000]) >>> torch.norm(c, dim=1) tensor([3.7417, 4.2426]) >>> torch.norm(c, p=1, dim=1) tensor([6., 6.]) >>> d = torch.arange(8, dtype= torch.float).reshape(2,2,2) >>> torch.norm(d, dim=(1,2)) tensor([ 3.7417, 11.2250]) >>> torch.norm(d[0, :, :]), torch.norm(d[1, :, :]) (tensor(3.7417), tensor(11.2250)) """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( norm, (input,), input, p=p, dim=dim, keepdim=keepdim, out=out, dtype=dtype) ndim = input.dim() # catch default case if dim is None and out is None and dtype is None and p is not None: if isinstance(p, str): if p == "fro": return _VF.frobenius_norm(input) if not isinstance(p, str): return _VF.norm(input, p) # TODO: when https://github.com/pytorch/pytorch/issues/33782 is fixed # remove the overloads where dim is an int and replace with BraodcastingList1 # and remove next four lines, replace _dim with dim if dim is not None: if isinstance(dim, int): _dim = [dim] else: _dim = dim else: _dim = None if isinstance(p, str): if p == "fro": if dtype is not None: raise ValueError("dtype argument is not supported in frobenius norm") if _dim is None: _dim = [i for i in range(ndim)] # noqa: C416 TODO: rewrite as list(range(m)) if out is None: return _VF.frobenius_norm(input, _dim, keepdim=keepdim) else: return _VF.frobenius_norm(input, _dim, keepdim=keepdim, out=out) elif p == "nuc": if dtype is not None: raise ValueError("dtype argument is not supported in nuclear norm") if _dim is None: if out is None: return _VF.nuclear_norm(input, keepdim=keepdim) else: return _VF.nuclear_norm(input, keepdim=keepdim, out=out) else: if out is None: return _VF.nuclear_norm(input, _dim, keepdim=keepdim) else: return _VF.nuclear_norm(input, _dim, keepdim=keepdim, out=out) raise RuntimeError("only valid string values are 'fro' and 'nuc', found 
{}".format(p)) else: if _dim is None: _dim = [i for i in range(ndim)] # noqa: C416 TODO: rewrite as list(range(m)) if out is None: if dtype is None: return _VF.norm(input, p, _dim, keepdim=keepdim) else: return _VF.norm(input, p, _dim, keepdim=keepdim, dtype=dtype) else: if dtype is None: return _VF.norm(input, p, _dim, keepdim=keepdim, out=out) else: return _VF.norm(input, p, _dim, keepdim=keepdim, dtype=dtype, out=out) def chain_matmul(*matrices): r"""Returns the matrix product of the :math:`N` 2-D tensors. This product is efficiently computed using the matrix chain order algorithm which selects the order in which incurs the lowest cost in terms of arithmetic operations (`[CLRS]`_). Note that since this is a function to compute the product, :math:`N` needs to be greater than or equal to 2; if equal to 2 then a trivial matrix-matrix product is returned. If :math:`N` is 1, then this is a no-op - the original matrix is returned as is. Args: matrices (Tensors...): a sequence of 2 or more 2-D tensors whose product is to be determined. Returns: Tensor: if the :math:`i^{th}` tensor was of dimensions :math:`p_{i} \times p_{i + 1}`, then the product would be of dimensions :math:`p_{1} \times p_{N + 1}`. Example:: >>> a = torch.randn(3, 4) >>> b = torch.randn(4, 5) >>> c = torch.randn(5, 6) >>> d = torch.randn(6, 7) >>> torch.chain_matmul(a, b, c, d) tensor([[ -2.3375, -3.9790, -4.1119, -6.6577, 9.5609, -11.5095, -3.2614], [ 21.4038, 3.3378, -8.4982, -5.2457, -10.2561, -2.4684, 2.7163], [ -0.9647, -5.8917, -2.3213, -5.2284, 12.8615, -12.2816, -2.5095]]) .. _`[CLRS]`: https://mitpress.mit.edu/books/introduction-algorithms-third-edition """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in matrices) and has_torch_function(matrices): return handle_torch_function(chain_matmul, matrices, *matrices) return _VF.chain_matmul(matrices) def _lu_impl(A, pivot=True, get_infos=False, out=None): # type: (Tensor, bool, bool, Any) -> Tuple[Tensor, Tensor, Tensor] r"""Computes the LU factorization of a matrix or batches of matrices :attr:`A`. Returns a tuple containing the LU factorization and pivots of :attr:`A`. Pivoting is done if :attr:`pivot` is set to ``True``. .. note:: The pivots returned by the function are 1-indexed. If :attr:`pivot` is ``False``, then the returned pivots is a tensor filled with zeros of the appropriate size. .. note:: LU factorization with :attr:`pivot` = ``False`` is not available for CPU, and attempting to do so will throw an error. However, LU factorization with :attr:`pivot` = ``False`` is available for CUDA. .. note:: This function does not check if the factorization was successful or not if :attr:`get_infos` is ``True`` since the status of the factorization is present in the third element of the return tuple. .. note:: In the case of batches of square matrices with size less or equal to 32 on a CUDA device, the LU factorization is repeated for singular matrices due to the bug in the MAGMA library (see magma issue 13). .. note:: ``L``, ``U``, and ``P`` can be derived using :func:`torch.lu_unpack`. Arguments: A (Tensor): the tensor to factor of size :math:`(*, m, n)` pivot (bool, optional): controls whether pivoting is done. Default: ``True`` get_infos (bool, optional): if set to ``True``, returns an info IntTensor. Default: ``False`` out (tuple, optional): optional output tuple. If :attr:`get_infos` is ``True``, then the elements in the tuple are Tensor, IntTensor, and IntTensor. 
If :attr:`get_infos` is ``False``, then the elements in the tuple are Tensor, IntTensor. Default: ``None`` Returns: (Tensor, IntTensor, IntTensor (optional)): A tuple of tensors containing - **factorization** (*Tensor*): the factorization of size :math:`(*, m, n)` - **pivots** (*IntTensor*): the pivots of size :math:`(*, m)` - **infos** (*IntTensor*, *optional*): if :attr:`get_infos` is ``True``, this is a tensor of size :math:`(*)` where non-zero values indicate whether factorization for the matrix or each minibatch has succeeded or failed Example:: >>> A = torch.randn(2, 3, 3) >>> A_LU, pivots = torch.lu(A) >>> A_LU tensor([[[ 1.3506, 2.5558, -0.0816], [ 0.1684, 1.1551, 0.1940], [ 0.1193, 0.6189, -0.5497]], [[ 0.4526, 1.2526, -0.3285], [-0.7988, 0.7175, -0.9701], [ 0.2634, -0.9255, -0.3459]]]) >>> pivots tensor([[ 3, 3, 3], [ 3, 3, 3]], dtype=torch.int32) >>> A_LU, pivots, info = torch.lu(A, get_infos=True) >>> if info.nonzero().size(0) == 0: ... print('LU factorization succeeded for all samples!') LU factorization succeeded for all samples! """ # If get_infos is True, then we don't need to check for errors and vice versa return torch._lu_with_info(A, pivot=pivot, check_errors=(not get_infos)) def _check_list_size(out_len, get_infos, out): # type: (int, bool, List[Tensor]) -> None get_infos_int = 1 if get_infos else 0 if out_len - get_infos_int != 2: raise TypeError("expected tuple of {} elements but got {}" .format(2 + int(get_infos), out_len)) if not isinstance(out, (tuple, list)): raise TypeError("argument 'out' must be tuple of Tensors, not {}" .format(type(out).__name__)) def _lu_with_infos(A, pivot=True, get_infos=False, out=None): # type: (Tensor, bool, bool, Optional[Tuple[Tensor, Tensor, Tensor]]) -> Tuple[Tensor, Tensor, Tensor] if not torch.jit.is_scripting(): if type(A) is not Tensor and has_torch_function((A,)): return handle_torch_function( lu, (A,), A, pivot=pivot, get_infos=get_infos, out=out) result = _lu_impl(A, pivot, get_infos, out) if out is not None: _check_list_size(len(out), get_infos, out) for i in range(len(out)): out[i].resize_as_(result[i]).copy_(result[i]) return out else: return result # A_LU, pivots, infos def _lu_no_infos(A, pivot=True, get_infos=False, out=None): # type: (Tensor, bool, bool, Optional[Tuple[Tensor, Tensor]]) -> Tuple[Tensor, Tensor] # need to check for torch_function here so that we exit early if a torch_function override is present if not torch.jit.is_scripting(): if type(A) is not Tensor and has_torch_function((A,)): return handle_torch_function( lu, (A,), A, pivot=pivot, get_infos=get_infos, out=out) result = _lu_impl(A, pivot, get_infos, out) if out is not None: _check_list_size(len(out), get_infos, out) for i in range(len(out)): out[i].resize_as_(result[i]).copy_(result[i]) return out else: return result[0], result[1] # A_LU, pivots # The return type of lu depends on `get_infos`, so in order to resolve the output type # of lu in TorchScript we need to statically know the value of `get_infos` lu = boolean_dispatch( arg_name='get_infos', arg_index=2, default=False, if_true=_lu_with_infos, if_false=_lu_no_infos, module_name=__name__, func_name='lu') lu.__doc__ = _lu_impl.__doc__ def align_tensors(*tensors): raise RuntimeError('`align_tensors` not yet implemented.') ================================================ FILE: patches/pytorch/1.7.0/functional.diff ================================================ 4a5,7 > import librosa # STFT patch for aarch64 > import numpy as np > 515,516c518,528 < return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore <
normalized, onesided, return_complex) --- > > # STFT patch for aarch64 > # https://stackoverflow.com/a/66872148 > librosa_stft = librosa.stft(input.cpu().detach().numpy().reshape(-1), n_fft, hop_length, win_length, window="hann", center=center, pad_mode=pad_mode) > librosa_stft = np.array([[a.real, a.imag] for a in librosa_stft]) > librosa_stft = np.transpose(librosa_stft, axes=[0, 2, 1]) > librosa_stft = np.expand_dims(librosa_stft, 0) > librosa_stft = torch.from_numpy(librosa_stft) > return librosa_stft > #return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore > # normalized, onesided, return_complex) ================================================ FILE: patches/pytorch/1.7.0/functional.original.py ================================================ from typing import ( Tuple, Optional, Union, Any, Sequence, TYPE_CHECKING ) import torch import torch.nn.functional as F from torch.types import _size from ._lowrank import svd_lowrank, pca_lowrank from .overrides import has_torch_function, handle_torch_function from ._jit_internal import boolean_dispatch, List from ._jit_internal import _overload as overload Tensor = torch.Tensor from torch import _VF __all__ = [ 'atleast_1d', 'atleast_2d', 'atleast_3d', 'align_tensors', 'broadcast_tensors', 'cartesian_prod', 'block_diag', 'cdist', 'chain_matmul', 'einsum', 'istft', 'lu', 'lu_unpack', 'norm', 'meshgrid', 'pca_lowrank', 'split', 'stft', 'svd_lowrank', 'tensordot', 'unique', 'unique_consecutive', ] def broadcast_tensors(*tensors): r"""broadcast_tensors(*tensors) -> List of Tensors Broadcasts the given tensors according to :ref:`broadcasting-semantics`. Args: *tensors: any number of tensors of the same type .. warning:: More than one element of a broadcasted tensor may refer to a single memory location. As a result, in-place operations (especially ones that are vectorized) may result in incorrect behavior. If you need to write to the tensors, please clone them first. Example:: >>> x = torch.arange(3).view(1, 3) >>> y = torch.arange(2).view(2, 1) >>> a, b = torch.broadcast_tensors(x, y) >>> a.size() torch.Size([2, 3]) >>> a tensor([[0, 1, 2], [0, 1, 2]]) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(broadcast_tensors, tensors, *tensors) return _VF.broadcast_tensors(tensors) # type: ignore def split(tensor, split_size_or_sections, dim=0): r"""Splits the tensor into chunks. Each chunk is a view of the original tensor. If :attr:`split_size_or_sections` is an integer type, then :attr:`tensor` will be split into equally sized chunks (if possible). Last chunk will be smaller if the tensor size along the given dimension :attr:`dim` is not divisible by :attr:`split_size`. If :attr:`split_size_or_sections` is a list, then :attr:`tensor` will be split into ``len(split_size_or_sections)`` chunks with sizes in :attr:`dim` according to :attr:`split_size_or_sections`. Arguments: tensor (Tensor): tensor to split. split_size_or_sections (int) or (list(int)): size of a single chunk or list of sizes for each chunk dim (int): dimension along which to split the tensor. 
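For instance, splitting the :math:`5 \times 2` tensor below along ``dim=0`` with ``split_size_or_sections=2`` yields chunks with 2, 2, and 1 rows, while ``[1, 4]`` yields chunks with 1 and 4 rows.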
Example:: >>> a = torch.arange(10).reshape(5,2) >>> a tensor([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]]) >>> torch.split(a, 2) (tensor([[0, 1], [2, 3]]), tensor([[4, 5], [6, 7]]), tensor([[8, 9]])) >>> torch.split(a, [1,4]) (tensor([[0, 1]]), tensor([[2, 3], [4, 5], [6, 7], [8, 9]])) """ if not torch.jit.is_scripting(): if type(tensor) is not Tensor and has_torch_function((tensor,)): return handle_torch_function(split, (tensor,), tensor, split_size_or_sections, dim=dim) # Overwriting reason: # This dispatches to two ATen functions depending on the type of # split_size_or_sections. The branching code is in tensor.py, which we # call here. return tensor.split(split_size_or_sections, dim) if TYPE_CHECKING: _Indices = _size else: _Indices = List[int] # equivalent to itertools.product(indices) def _indices_product(indices: _Indices) -> List[List[int]]: empty_list = torch.jit.annotate(List[int], []) result = [empty_list] for idx in indices: result_temp = torch.jit.annotate(List[List[int]], []) for res in result: for i in range(idx): result_temp.append(res + [i]) result = result_temp return result def _index_tensor_with_indices_list(tensor, indices): # type: (Tensor, List[int]) -> Tensor out = tensor for index in indices: out = out[index] return out def lu_unpack(LU_data, LU_pivots, unpack_data=True, unpack_pivots=True): # type: (Tensor, Tensor, bool, bool) -> (Tuple[Optional[Tensor], Optional[Tensor], Optional[Tensor]]) r"""Unpacks the data and pivots from a LU factorization of a tensor. Returns a tuple of tensors as ``(the pivots, the L tensor, the U tensor)``. Arguments: LU_data (Tensor): the packed LU factorization data LU_pivots (Tensor): the packed LU factorization pivots unpack_data (bool): flag indicating if the data should be unpacked unpack_pivots (bool): flag indicating if the pivots should be unpacked Examples:: >>> A = torch.randn(2, 3, 3) >>> A_LU, pivots = A.lu() >>> P, A_L, A_U = torch.lu_unpack(A_LU, pivots) >>> >>> # can recover A from factorization >>> A_ = torch.bmm(P, torch.bmm(A_L, A_U)) >>> # LU factorization of a rectangular matrix: >>> A = torch.randn(2, 3, 2) >>> A_LU, pivots = A.lu() >>> P, A_L, A_U = torch.lu_unpack(A_LU, pivots) >>> P tensor([[[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]], [[0., 0., 1.], [0., 1., 0.], [1., 0., 0.]]]) >>> A_L tensor([[[ 1.0000, 0.0000], [ 0.4763, 1.0000], [ 0.3683, 0.1135]], [[ 1.0000, 0.0000], [ 0.2957, 1.0000], [-0.9668, -0.3335]]]) >>> A_U tensor([[[ 2.1962, 1.0881], [ 0.0000, -0.8681]], [[-1.0947, 0.3736], [ 0.0000, 0.5718]]]) >>> A_ = torch.bmm(P, torch.bmm(A_L, A_U)) >>> torch.norm(A_ - A) tensor(2.9802e-08) """ if not torch.jit.is_scripting(): tens_ops = (LU_data, LU_pivots) if any([type(t) is not Tensor for t in tens_ops]) and has_torch_function(tens_ops): return handle_torch_function( lu_unpack, tens_ops, LU_data, LU_pivots, unpack_data=unpack_data, unpack_pivots=unpack_pivots) shape = LU_data.shape # In generalized LU factorization, the following shape relations hold: # A.shape[-2:] == (m, n) # P.shape[-2:] == (m, m) # L.shape[-2:] == (m, k) # U.shape[-2:] == (k, n) # where k = min(m, n) m, n = shape[-2:] k = min(m, n) if unpack_data: U: Optional[Tensor] = LU_data.triu() assert U is not None if m != k: U = U.narrow(-2, 0, k) L: Optional[Tensor] = LU_data.tril() assert L is not None if k != n: L = L.narrow(-1, 0, k) L.diagonal(dim1=-2, dim2=-1).fill_(1) else: L = U = None if unpack_pivots: LU_pivots_zero_idx = LU_pivots - 1 if LU_data.dim() > 2: P: Optional[Tensor] = torch.eye(m, device=LU_data.device, dtype=LU_data.dtype) \ 
.expand(shape[:-1] + (m,)) \ .clone(memory_format=torch.contiguous_format) assert P is not None # TODO: rewrite when TorchScript supports product and map as # product(*map(lambda x: list(range(x)), shape[:-2])) when issue 33781 is fixed indices = _indices_product(shape[:-2]) for idx in indices: final_order = [i for i in range(m)] # noqa: C416 TODO: rewrite as list(range(m)) for k, j in enumerate(_index_tensor_with_indices_list(LU_pivots_zero_idx, idx)): final_order[k], final_order[j] = final_order[j], final_order[k] # TODO: remove _index_tensor_with_indices_list when TorchScript supports indexing Tensor with list p_idx = _index_tensor_with_indices_list(P, idx) p_idx.copy_(p_idx.index_select(1, torch.as_tensor(final_order, device=LU_pivots.device))) else: P = torch.eye(m, device=LU_data.device, dtype=LU_data.dtype) final_order = [i for i in range(m)] # noqa: C416 TODO: rewrite as list(range(m)) for k, j, in enumerate(LU_pivots_zero_idx): final_order[k], final_order[j] = final_order[j], final_order[k] P = P.index_select(1, torch.as_tensor(final_order, device=LU_pivots.device)) else: P = None return P, L, U def einsum(equation, *operands): r"""einsum(equation, *operands) -> Tensor This function provides a way of computing multilinear expressions (i.e. sums of products) using the Einstein summation convention. Args: equation (string): The equation is given in terms of lower case letters (indices) to be associated with each dimension of the operands and result. The left hand side lists the operands dimensions, separated by commas. There should be one index letter per tensor dimension. The right hand side follows after `->` and gives the indices for the output. If the `->` and right hand side are omitted, it implicitly defined as the alphabetically sorted list of all indices appearing exactly once in the left hand side. The indices not apprearing in the output are summed over after multiplying the operands entries. If an index appears several times for the same operand, a diagonal is taken. Ellipses `...` represent a fixed number of dimensions. If the right hand side is inferred, the ellipsis dimensions are at the beginning of the output. operands (Tensor): The operands to compute the Einstein sum of. .. note:: This function does not optimize the given expression, so a different formula for the same computation may run faster or consume less memory. Projects like opt_einsum (https://optimized-einsum.readthedocs.io/en/stable/) can optimize the formula for you. 
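For instance, ``torch.einsum('ij,jk', A, B)`` is equivalent to ``torch.einsum('ij,jk->ik', A, B)``: the indices ``i`` and ``k`` each appear exactly once on the left hand side, so the inferred output is their alphabetically sorted list, while the repeated index ``j`` is summed over.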
Examples:: >>> x = torch.randn(5) >>> y = torch.randn(4) >>> torch.einsum('i,j->ij', x, y) # outer product tensor([[-0.0570, -0.0286, -0.0231, 0.0197], [ 1.2616, 0.6335, 0.5113, -0.4351], [ 1.4452, 0.7257, 0.5857, -0.4984], [-0.4647, -0.2333, -0.1883, 0.1603], [-1.1130, -0.5588, -0.4510, 0.3838]]) >>> A = torch.randn(3,5,4) >>> l = torch.randn(2,5) >>> r = torch.randn(2,4) >>> torch.einsum('bn,anm,bm->ba', l, A, r) # compare torch.nn.functional.bilinear tensor([[-0.3430, -5.2405, 0.4494], [ 0.3311, 5.5201, -3.0356]]) >>> As = torch.randn(3,2,5) >>> Bs = torch.randn(3,5,4) >>> torch.einsum('bij,bjk->bik', As, Bs) # batch matrix multiplication tensor([[[-1.0564, -1.5904, 3.2023, 3.1271], [-1.6706, -0.8097, -0.8025, -2.1183]], [[ 4.2239, 0.3107, -0.5756, -0.2354], [-1.4558, -0.3460, 1.5087, -0.8530]], [[ 2.8153, 1.8787, -4.3839, -1.2112], [ 0.3728, -2.1131, 0.0921, 0.8305]]]) >>> A = torch.randn(3, 3) >>> torch.einsum('ii->i', A) # diagonal tensor([-0.7825, 0.8291, -0.1936]) >>> A = torch.randn(4, 3, 3) >>> torch.einsum('...ii->...i', A) # batch diagonal tensor([[-1.0864, 0.7292, 0.0569], [-0.9725, -1.0270, 0.6493], [ 0.5832, -1.1716, -1.5084], [ 0.4041, -1.1690, 0.8570]]) >>> A = torch.randn(2, 3, 4, 5) >>> torch.einsum('...ij->...ji', A).shape # batch permute torch.Size([2, 3, 5, 4]) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in operands) and has_torch_function(operands): return handle_torch_function(einsum, operands, equation, *operands) if len(operands) == 1 and isinstance(operands[0], (list, tuple)): # the old interface of passing the operands as one list argument _operands = operands[0] # recurse incase operands contains value that has torch function # in the original implementation this line is omitted return einsum(equation, *_operands) return _VF.einsum(equation, operands) # type: ignore if TYPE_CHECKING: # The JIT doesn't understand Union, so only add type annotation for mypy def meshgrid(*tensors: Union[Tensor, List[Tensor]]) -> Tuple[Tensor, ...]: return _meshgrid(*tensors) else: def meshgrid(*tensors): return _meshgrid(*tensors) def _meshgrid(*tensors): r"""Take :math:`N` tensors, each of which can be either scalar or 1-dimensional vector, and create :math:`N` N-dimensional grids, where the :math:`i` :sup:`th` grid is defined by expanding the :math:`i` :sup:`th` input over dimensions defined by other inputs. Args: tensors (list of Tensor): list of scalars or 1 dimensional tensors. Scalars will be treated as tensors of size :math:`(1,)` automatically Returns: seq (sequence of Tensors): If the input has :math:`k` tensors of size :math:`(N_1,), (N_2,), \ldots , (N_k,)`, then the output would also have :math:`k` tensors, where all tensors are of size :math:`(N_1, N_2, \ldots , N_k)`. 
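For instance, with the two inputs of size :math:`(3,)` below, both returned grids have size :math:`(3, 3)`: the first expands ``x`` along the new second dimension and the second expands ``y`` along the new first dimension.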
Example:: >>> x = torch.tensor([1, 2, 3]) >>> y = torch.tensor([4, 5, 6]) >>> grid_x, grid_y = torch.meshgrid(x, y) >>> grid_x tensor([[1, 1, 1], [2, 2, 2], [3, 3, 3]]) >>> grid_y tensor([[4, 5, 6], [4, 5, 6], [4, 5, 6]]) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(meshgrid, tensors, *tensors) if len(tensors) == 1 and isinstance(tensors[0], (list, tuple)): # the old interface of passing the operands as one list argument tensors = tensors[0] # type: ignore return _VF.meshgrid(tensors) # type: ignore def stft(input: Tensor, n_fft: int, hop_length: Optional[int] = None, win_length: Optional[int] = None, window: Optional[Tensor] = None, center: bool = True, pad_mode: str = 'reflect', normalized: bool = False, onesided: Optional[bool] = None, return_complex: Optional[bool] = None) -> Tensor: r"""Short-time Fourier transform (STFT). .. warning:: Setting :attr:`return_complex` explicitly will be required in a future PyTorch release. Set it to False to preserve the current behavior or True to return a complex output. The STFT computes the Fourier transform of short overlapping windows of the input. This giving frequency components of the signal as they change over time. The interface of this function is modeled after the librosa_ stft function. .. _librosa: https://librosa.org/doc/latest/generated/librosa.stft.html Ignoring the optional batch dimension, this method computes the following expression: .. math:: X[m, \omega] = \sum_{k = 0}^{\text{win\_length-1}}% \text{window}[k]\ \text{input}[m \times \text{hop\_length} + k]\ % \exp\left(- j \frac{2 \pi \cdot \omega k}{\text{win\_length}}\right), where :math:`m` is the index of the sliding window, and :math:`\omega` is the frequency that :math:`0 \leq \omega < \text{n\_fft}`. When :attr:`onesided` is the default value ``True``, * :attr:`input` must be either a 1-D time sequence or a 2-D batch of time sequences. * If :attr:`hop_length` is ``None`` (default), it is treated as equal to ``floor(n_fft / 4)``. * If :attr:`win_length` is ``None`` (default), it is treated as equal to :attr:`n_fft`. * :attr:`window` can be a 1-D tensor of size :attr:`win_length`, e.g., from :meth:`torch.hann_window`. If :attr:`window` is ``None`` (default), it is treated as if having :math:`1` everywhere in the window. If :math:`\text{win\_length} < \text{n\_fft}`, :attr:`window` will be padded on both sides to length :attr:`n_fft` before being applied. * If :attr:`center` is ``True`` (default), :attr:`input` will be padded on both sides so that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. Otherwise, the :math:`t`-th frame begins at time :math:`t \times \text{hop\_length}`. * :attr:`pad_mode` determines the padding method used on :attr:`input` when :attr:`center` is ``True``. See :meth:`torch.nn.functional.pad` for all available options. Default is ``"reflect"``. * If :attr:`onesided` is ``True`` (default for real input), only values for :math:`\omega` in :math:`\left[0, 1, 2, \dots, \left\lfloor \frac{\text{n\_fft}}{2} \right\rfloor + 1\right]` are returned because the real-to-complex Fourier transform satisfies the conjugate symmetry, i.e., :math:`X[m, \omega] = X[m, \text{n\_fft} - \omega]^*`. Note if the input or window tensors are complex, then :attr:`onesided` output is not possible. 
* If :attr:`normalized` is ``True`` (default is ``False``), the function returns the normalized STFT results, i.e., multiplied by :math:`(\text{frame\_length})^{-0.5}`. * If :attr:`return_complex` is ``True`` (default if input is complex), the return is a ``input.dim() + 1`` dimensional complex tensor. If ``False``, the output is a ``input.dim() + 2`` dimensional real tensor where the last dimension represents the real and imaginary components. Returns either a complex tensor of size :math:`(* \times N \times T)` if :attr:`return_complex` is true, or a real tensor of size :math:`(* \times N \times T \times 2)`. Where :math:`*` is the optional batch size of :attr:`input`, :math:`N` is the number of frequencies where STFT is applied and :math:`T` is the total number of frames used. .. warning:: This function changed signature at version 0.4.1. Calling with the previous signature may cause error or return incorrect result. Arguments: input (Tensor): the input tensor n_fft (int): size of Fourier transform hop_length (int, optional): the distance between neighboring sliding window frames. Default: ``None`` (treated as equal to ``floor(n_fft / 4)``) win_length (int, optional): the size of window frame and STFT filter. Default: ``None`` (treated as equal to :attr:`n_fft`) window (Tensor, optional): the optional window function. Default: ``None`` (treated as window of all :math:`1` s) center (bool, optional): whether to pad :attr:`input` on both sides so that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. Default: ``True`` pad_mode (string, optional): controls the padding method used when :attr:`center` is ``True``. Default: ``"reflect"`` normalized (bool, optional): controls whether to return the normalized STFT results Default: ``False`` onesided (bool, optional): controls whether to return half of results to avoid redundancy for real inputs. Default: ``True`` for real :attr:`input` and :attr:`window`, ``False`` otherwise. return_complex (bool, optional): whether to return a complex tensor, or a real tensor with an extra last dimension for the real and imaginary components. Returns: Tensor: A tensor containing the STFT result with shape described above """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( stft, (input,), input, n_fft, hop_length=hop_length, win_length=win_length, window=window, center=center, pad_mode=pad_mode, normalized=normalized, onesided=onesided, return_complex=return_complex) # TODO: after having proper ways to map Python strings to ATen Enum, move # this and F.pad to ATen. if center: signal_dim = input.dim() extended_shape = [1] * (3 - signal_dim) + list(input.size()) pad = int(n_fft // 2) input = F.pad(input.view(extended_shape), (pad, pad), pad_mode) input = input.view(input.shape[-signal_dim:]) return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore normalized, onesided, return_complex) def istft(input: Tensor, n_fft: int, hop_length: Optional[int] = None, win_length: Optional[int] = None, window: Optional[Tensor] = None, center: bool = True, normalized: bool = False, onesided: Optional[bool] = None, length: Optional[int] = None, return_complex: bool = False) -> Tensor: r"""Inverse short time Fourier Transform. This is expected to be the inverse of :func:`~torch.stft`. It has the same parameters (+ additional optional parameter of :attr:`length`) and it should return the least squares estimation of the original signal. 
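A minimal round-trip sketch (an illustration assuming a Hann window, which satisfies the NOLA condition discussed below; :attr:`length` trims the reconstruction to the original size)::

    >>> x = torch.randn(8000)
    >>> w = torch.hann_window(400)
    >>> X = torch.stft(x, n_fft=400, window=w, return_complex=False)
    >>> y = torch.istft(X, n_fft=400, window=w, length=x.size(0))
    >>> torch.allclose(x, y, atol=1e-5)
    True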
The algorithm will check using the NOLA condition ( nonzero overlap). Important consideration in the parameters :attr:`window` and :attr:`center` so that the envelop created by the summation of all the windows is never zero at certain point in time. Specifically, :math:`\sum_{t=-\infty}^{\infty} |w|^2[n-t\times hop\_length] \cancel{=} 0`. Since :func:`~torch.stft` discards elements at the end of the signal if they do not fit in a frame, ``istft`` may return a shorter signal than the original signal (can occur if :attr:`center` is False since the signal isn't padded). If :attr:`center` is ``True``, then there will be padding e.g. ``'constant'``, ``'reflect'``, etc. Left padding can be trimmed off exactly because they can be calculated but right padding cannot be calculated without additional information. Example: Suppose the last window is: ``[17, 18, 0, 0, 0]`` vs ``[18, 0, 0, 0, 0]`` The :attr:`n_fft`, :attr:`hop_length`, :attr:`win_length` are all the same which prevents the calculation of right padding. These additional values could be zeros or a reflection of the signal so providing :attr:`length` could be useful. If :attr:`length` is ``None`` then padding will be aggressively removed (some loss of signal). [1] D. W. Griffin and J. S. Lim, "Signal estimation from modified short-time Fourier transform," IEEE Trans. ASSP, vol.32, no.2, pp.236-243, Apr. 1984. Arguments: input (Tensor): The input tensor. Expected to be output of :func:`~torch.stft`, can either be complex (``channel``, ``fft_size``, ``n_frame``), or real (``channel``, ``fft_size``, ``n_frame``, 2) where the ``channel`` dimension is optional. n_fft (int): Size of Fourier transform hop_length (Optional[int]): The distance between neighboring sliding window frames. (Default: ``n_fft // 4``) win_length (Optional[int]): The size of window frame and STFT filter. (Default: ``n_fft``) window (Optional[torch.Tensor]): The optional window function. (Default: ``torch.ones(win_length)``) center (bool): Whether :attr:`input` was padded on both sides so that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. (Default: ``True``) normalized (bool): Whether the STFT was normalized. (Default: ``False``) onesided (Optional[bool]): Whether the STFT was onesided. (Default: ``True`` if ``n_fft != fft_size`` in the input size) length (Optional[int]): The amount to trim the signal by (i.e. the original signal length). (Default: whole signal) return_complex (Optional[bool]): Whether the output should be complex, or if the input should be assumed to derive from a real signal and window. Note that this is incompatible with ``onesided=True``. 
(Default: ``False``) Returns: Tensor: Least squares estimation of the original signal of size (..., signal_length) """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( istft, (input,), input, n_fft, hop_length=hop_length, win_length=win_length, window=window, center=center, normalized=normalized, onesided=onesided, length=length, return_complex=return_complex) return _VF.istft(input, n_fft, hop_length, win_length, window, center, # type: ignore normalized, onesided, length, return_complex) del torch.unique_dim if TYPE_CHECKING: # These _impl functions return a variable number of tensors as output with # __torch_function__; tuple unpacking is done already rather than being # done by the caller of the _impl function _unique_impl_out = Any else: _unique_impl_out = Tuple[Tensor, Tensor, Tensor] def _unique_impl(input: Tensor, sorted: bool = True, return_inverse: bool = False, return_counts: bool = False, dim: Optional[int] = None) -> _unique_impl_out: r"""Returns the unique elements of the input tensor. .. note:: This function is different from :func:`torch.unique_consecutive` in the sense that this function also eliminates non-consecutive duplicate values. .. note:: Currently in the CUDA implementation and the CPU implementation when dim is specified, `torch.unique` always sort the tensor at the beginning regardless of the `sort` argument. Sorting could be slow, so if your input tensor is already sorted, it is recommended to use :func:`torch.unique_consecutive` which avoids the sorting. Arguments: input (Tensor): the input tensor sorted (bool): Whether to sort the unique elements in ascending order before returning as output. return_inverse (bool): Whether to also return the indices for where elements in the original input ended up in the returned unique list. return_counts (bool): Whether to also return the counts for each unique element. dim (int): the dimension to apply unique. If ``None``, the unique of the flattened input is returned. default: ``None`` Returns: (Tensor, Tensor (optional), Tensor (optional)): A tensor or a tuple of tensors containing - **output** (*Tensor*): the output list of unique scalar elements. - **inverse_indices** (*Tensor*): (optional) if :attr:`return_inverse` is True, there will be an additional returned tensor (same shape as input) representing the indices for where elements in the original input map to in the output; otherwise, this function will only return a single tensor. - **counts** (*Tensor*): (optional) if :attr:`return_counts` is True, there will be an additional returned tensor (same shape as output or output.size(dim), if dim was specified) representing the number of occurrences for each unique value or tensor. 
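When :attr:`dim` is given, uniqueness is determined over complete subtensors along that dimension (for example, whole rows when ``dim=0``) rather than over individual scalar elements.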
Example:: >>> output = torch.unique(torch.tensor([1, 3, 2, 3], dtype=torch.long)) >>> output tensor([ 1, 2, 3]) >>> output, inverse_indices = torch.unique( torch.tensor([1, 3, 2, 3], dtype=torch.long), sorted=True, return_inverse=True) >>> output tensor([ 1, 2, 3]) >>> inverse_indices tensor([ 0, 2, 1, 2]) >>> output, inverse_indices = torch.unique( torch.tensor([[1, 3], [2, 3]], dtype=torch.long), sorted=True, return_inverse=True) >>> output tensor([ 1, 2, 3]) >>> inverse_indices tensor([[ 0, 2], [ 1, 2]]) """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( unique, (input,), input, sorted=sorted, return_inverse=return_inverse, return_counts=return_counts, dim=dim) if dim is not None: output, inverse_indices, counts = _VF.unique_dim( # type: ignore input, dim, sorted=sorted, return_inverse=return_inverse, return_counts=return_counts, ) else: output, inverse_indices, counts = torch._unique2( input, sorted=sorted, return_inverse=return_inverse, return_counts=return_counts, ) return output, inverse_indices, counts def _unique_consecutive_impl(input: Tensor, return_inverse: bool = False, return_counts: bool = False, dim: Optional[int] = None) -> _unique_impl_out: r"""Eliminates all but the first element from every consecutive group of equivalent elements. .. note:: This function is different from :func:`torch.unique` in the sense that this function only eliminates consecutive duplicate values. The semantics are similar to `std::unique` in C++. Arguments: input (Tensor): the input tensor return_inverse (bool): Whether to also return the indices for where elements in the original input ended up in the returned unique list. return_counts (bool): Whether to also return the counts for each unique element. dim (int): the dimension to apply unique. If ``None``, the unique of the flattened input is returned. default: ``None`` Returns: (Tensor, Tensor (optional), Tensor (optional)): A tensor or a tuple of tensors containing - **output** (*Tensor*): the output list of unique scalar elements. - **inverse_indices** (*Tensor*): (optional) if :attr:`return_inverse` is True, there will be an additional returned tensor (same shape as input) representing the indices for where elements in the original input map to in the output; otherwise, this function will only return a single tensor. - **counts** (*Tensor*): (optional) if :attr:`return_counts` is True, there will be an additional returned tensor (same shape as output or output.size(dim), if dim was specified) representing the number of occurrences for each unique value or tensor.
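As a brief sketch of the ``dim`` path (a toy 2-D input; entire rows are compared as single units):

>>> x = torch.tensor([[1, 1], [1, 1], [2, 2]])
>>> torch.unique_consecutive(x, dim=0)
tensor([[1, 1],
        [2, 2]])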
Example:: >>> x = torch.tensor([1, 1, 2, 2, 3, 1, 1, 2]) >>> output = torch.unique_consecutive(x) >>> output tensor([1, 2, 3, 1, 2]) >>> output, inverse_indices = torch.unique_consecutive(x, return_inverse=True) >>> output tensor([1, 2, 3, 1, 2]) >>> inverse_indices tensor([0, 0, 1, 1, 2, 3, 3, 4]) >>> output, counts = torch.unique_consecutive(x, return_counts=True) >>> output tensor([1, 2, 3, 1, 2]) >>> counts tensor([2, 2, 1, 2, 1]) """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( unique_consecutive, (input,), input, return_inverse=return_inverse, return_counts=return_counts, dim=dim) output, inverse_indices, counts = _VF.unique_consecutive( # type: ignore input, return_inverse=return_inverse, return_counts=return_counts, dim=dim) return output, inverse_indices, counts def _return_counts(input, sorted=True, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor] if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_impl(input, sorted, return_inverse, return_counts, dim) output, _, counts = _unique_impl(input, sorted, return_inverse, return_counts, dim) return output, counts def _return_output(input, sorted=True, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, bool, Optional[int]) -> Tensor if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_impl(input, sorted, return_inverse, return_counts, dim) output, _, _ = _unique_impl(input, sorted, return_inverse, return_counts, dim) return output def _return_inverse(input, sorted=True, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor] if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_impl(input, sorted, return_inverse, return_counts, dim) output, inverse_indices, _ = _unique_impl(input, sorted, return_inverse, return_counts, dim) return output, inverse_indices _return_inverse_false = boolean_dispatch( arg_name='return_counts', arg_index=3, default=False, if_true=_return_counts, if_false=_return_output, module_name=__name__, func_name='unique') _return_inverse_true = boolean_dispatch( arg_name='return_counts', arg_index=3, default=False, if_true=_unique_impl, if_false=_return_inverse, module_name=__name__, func_name='unique') # The return type of unique depends on `return_inverse`, and `return_counts` so in order to # resolve the output type in TorchScript we need to statically know the value of both parameters unique = boolean_dispatch( arg_name='return_inverse', arg_index=2, default=False, if_true=_return_inverse_true, if_false=_return_inverse_false, module_name=__name__, func_name='unique') unique.__doc__ = _unique_impl.__doc__ def _consecutive_return_counts(input, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor] if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_consecutive_impl(input, return_inverse, return_counts, dim) output, _, counts = _unique_consecutive_impl(input, return_inverse, return_counts, dim) return output, counts def _consecutive_return_output(input, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, Optional[int]) -> Tensor if 
not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_consecutive_impl(input, return_inverse, return_counts, dim) output, _, _ = _unique_consecutive_impl(input, return_inverse, return_counts, dim) return output def _consecutive_return_inverse(input, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor] if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_consecutive_impl(input, return_inverse, return_counts, dim) output, inverse_indices, _ = _unique_consecutive_impl(input, return_inverse, return_counts, dim) return output, inverse_indices _consecutive_return_inverse_false = boolean_dispatch( arg_name='return_counts', arg_index=1, default=False, if_true=_consecutive_return_counts, if_false=_consecutive_return_output, module_name=__name__, func_name='unique_consecutive') _consecutive_return_inverse_true = boolean_dispatch( arg_name='return_counts', arg_index=1, default=False, if_true=_unique_consecutive_impl, if_false=_consecutive_return_inverse, module_name=__name__, func_name='unique_consecutive') # The return type of unique depends on `return_inverse`, and `return_counts` so in order to # resolve the output type in TorchScript we need to statically know the value of both parameters unique_consecutive = boolean_dispatch( arg_name='return_inverse', arg_index=2, default=False, if_true=_consecutive_return_inverse_true, if_false=_consecutive_return_inverse_false, module_name=__name__, func_name='unique_consecutive') unique_consecutive.__doc__ = _unique_consecutive_impl.__doc__ def tensordot(a, b, dims=2): r"""Returns a contraction of a and b over multiple dimensions. :attr:`tensordot` implements a generalized matrix product. Args: a (Tensor): Left tensor to contract b (Tensor): Right tensor to contract dims (int or tuple of two lists of integers): number of dimensions to contract or explicit lists of dimensions for :attr:`a` and :attr:`b` respectively When called with a non-negative integer argument :attr:`dims` = :math:`d`, and the number of dimensions of :attr:`a` and :attr:`b` is :math:`m` and :math:`n`, respectively, :func:`~torch.tensordot` computes .. math:: r_{i_0,...,i_{m-d}, i_d,...,i_n} = \sum_{k_0,...,k_{d-1}} a_{i_0,...,i_{m-d},k_0,...,k_{d-1}} \times b_{k_0,...,k_{d-1}, i_d,...,i_n}. When called with :attr:`dims` of the list form, the given dimensions will be contracted in place of the last :math:`d` of :attr:`a` and the first :math:`d` of :math:`b`. The sizes in these dimensions must match, but :func:`~torch.tensordot` will deal with broadcasted dimensions. 
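As a quick sanity check (a sketch with small deterministic inputs, not a normative example), an integer ``dims=1`` reduces to an ordinary matrix product:

>>> a = torch.arange(6.).reshape(2, 3)
>>> b = torch.arange(12.).reshape(3, 4)
>>> torch.allclose(torch.tensordot(a, b, dims=1), a @ b)
True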
Examples:: >>> a = torch.arange(60.).reshape(3, 4, 5) >>> b = torch.arange(24.).reshape(4, 3, 2) >>> torch.tensordot(a, b, dims=([1, 0], [0, 1])) tensor([[4400., 4730.], [4532., 4874.], [4664., 5018.], [4796., 5162.], [4928., 5306.]]) >>> a = torch.randn(3, 4, 5, device='cuda') >>> b = torch.randn(4, 5, 6, device='cuda') >>> torch.tensordot(a, b, dims=2).cpu() tensor([[ 8.3504, -2.5436, 6.2922, 2.7556, -1.0732, 3.2741], [ 3.3161, 0.0704, 5.0187, -0.4079, -4.3126, 4.8744], [ 0.8223, 3.9445, 3.2168, -0.2400, 3.4117, 1.7780]]) """ if not torch.jit.is_scripting(): if (type(a) is not Tensor or type(b) is not Tensor) and has_torch_function((a, b)): return handle_torch_function(tensordot, (a, b), a, b, dims=dims) if isinstance(dims, (list, tuple)) or \ (isinstance(dims, torch.Tensor) and dims.numel() > 1): dims_a, dims_b = dims else: if isinstance(dims, torch.Tensor): dims = dims.item() if dims < 0: raise RuntimeError(f"tensordot expects dims >= 0, but got dims={dims}") dims_a = list(range(-dims, 0)) dims_b = list(range(dims)) return _VF.tensordot(a, b, dims_a, dims_b) # type: ignore def cartesian_prod(*tensors): """Computes the Cartesian product of the given sequence of tensors. The behavior is similar to Python's `itertools.product`. Arguments: *tensors: any number of 1 dimensional tensors. Returns: Tensor: A tensor equivalent to converting all the input tensors into lists, doing `itertools.product` on these lists, and finally converting the resulting list into a tensor. Example:: >>> a = [1, 2, 3] >>> b = [4, 5] >>> list(itertools.product(a, b)) [(1, 4), (1, 5), (2, 4), (2, 5), (3, 4), (3, 5)] >>> tensor_a = torch.tensor(a) >>> tensor_b = torch.tensor(b) >>> torch.cartesian_prod(tensor_a, tensor_b) tensor([[1, 4], [1, 5], [2, 4], [2, 5], [3, 4], [3, 5]]) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(cartesian_prod, tensors, *tensors) return _VF.cartesian_prod(tensors) # type: ignore def block_diag(*tensors): """Create a block diagonal matrix from provided tensors. Arguments: *tensors: One or more tensors with 0, 1, or 2 dimensions. Returns: Tensor: A 2 dimensional tensor with all the input tensors arranged in order such that their upper left and lower right corners are diagonally adjacent. All other elements are set to 0. Example:: >>> import torch >>> A = torch.tensor([[0, 1], [1, 0]]) >>> B = torch.tensor([[3, 4, 5], [6, 7, 8]]) >>> C = torch.tensor(7) >>> D = torch.tensor([1, 2, 3]) >>> E = torch.tensor([[4], [5], [6]]) >>> torch.block_diag(A, B, C, D, E) tensor([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 3, 4, 5, 0, 0, 0, 0, 0], [0, 0, 6, 7, 8, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 7, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 2, 3, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 4], [0, 0, 0, 0, 0, 0, 0, 0, 0, 5], [0, 0, 0, 0, 0, 0, 0, 0, 0, 6]]) """ if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(block_diag, tensors, *tensors) return torch._C._VariableFunctions.block_diag(tensors) # type: ignore def cdist(x1, x2, p=2., compute_mode='use_mm_for_euclid_dist_if_necessary'): # type: (Tensor, Tensor, float, str) -> (Tensor) r"""Computes the batched p-norm distance between each pair of the two collections of row vectors. Args: x1 (Tensor): input tensor of shape :math:`B \times P \times M`. x2 (Tensor): input tensor of shape :math:`B \times R \times M`. p: p value for the p-norm distance to calculate between each vector pair :math:`\in [0, \infty]`.
compute_mode: 'use_mm_for_euclid_dist_if_necessary' - will use matrix multiplication approach to calculate euclidean distance (p = 2) if P > 25 or R > 25 'use_mm_for_euclid_dist' - will always use matrix multiplication approach to calculate euclidean distance (p = 2) 'donot_use_mm_for_euclid_dist' - will never use matrix multiplication approach to calculate euclidean distance (p = 2) Default: use_mm_for_euclid_dist_if_necessary. If x1 has shape :math:`B \times P \times M` and x2 has shape :math:`B \times R \times M` then the output will have shape :math:`B \times P \times R`. This function is equivalent to `scipy.spatial.distance.cdist(input, 'minkowski', p=p)` if :math:`p \in (0, \infty)`. When :math:`p = 0` it is equivalent to `scipy.spatial.distance.cdist(input, 'hamming') * M`. When :math:`p = \infty`, the closest scipy function is `scipy.spatial.distance.cdist(xn, lambda x, y: np.abs(x - y).max())`. Example: >>> a = torch.tensor([[0.9041, 0.0196], [-0.3108, -2.4423], [-0.4821, 1.059]]) >>> a tensor([[ 0.9041, 0.0196], [-0.3108, -2.4423], [-0.4821, 1.0590]]) >>> b = torch.tensor([[-2.1763, -0.4713], [-0.6986, 1.3702]]) >>> b tensor([[-2.1763, -0.4713], [-0.6986, 1.3702]]) >>> torch.cdist(a, b, p=2) tensor([[3.1193, 2.0959], [2.7138, 3.8322], [2.2830, 0.3791]]) """ if not torch.jit.is_scripting(): if (type(x1) is not Tensor or type(x2) is not Tensor) and has_torch_function((x1, x2)): return handle_torch_function( cdist, (x1, x2), x1, x2, p=p, compute_mode=compute_mode) if compute_mode == 'use_mm_for_euclid_dist_if_necessary': return _VF.cdist(x1, x2, p, None) # type: ignore elif compute_mode == 'use_mm_for_euclid_dist': return _VF.cdist(x1, x2, p, 1) # type: ignore elif compute_mode == 'donot_use_mm_for_euclid_dist': return _VF.cdist(x1, x2, p, 2) # type: ignore else: raise ValueError(f"{compute_mode} is not a valid value for compute_mode") def atleast_1d(*tensors): r""" Returns a 1-dimensional view of each input tensor with zero dimensions. Input tensors with one or more dimensions are returned as-is. Args: input (Tensor or list of Tensors) Returns: output (Tensor or tuple of Tensors) Example:: >>> x = torch.randn(2) >>> x tensor([1.4584, 0.7583]) >>> torch.atleast_1d(x) tensor([1.4584, 0.7583]) >>> x = torch.tensor(1.) >>> x tensor(1.) >>> torch.atleast_1d(x) tensor([1.]) >>> x = torch.tensor(0.5) >>> y = torch.tensor(1.) >>> torch.atleast_1d((x,y)) (tensor([0.5000]), tensor([1.])) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(atleast_1d, tensors, *tensors) if len(tensors) == 1: tensors = tensors[0] return _VF.atleast_1d(tensors) # type: ignore def atleast_2d(*tensors): r""" Returns a 2-dimensional view of each input tensor with zero dimensions. Input tensors with two or more dimensions are returned as-is. Args: input (Tensor or list of Tensors) Returns: output (Tensor or tuple of Tensors) Example:: >>> x = torch.tensor(1.) >>> x tensor(1.) >>> torch.atleast_2d(x) tensor([[1.]]) >>> x = torch.randn(2,2) >>> x tensor([[2.2086, 2.5165], [0.1757, 0.5194]]) >>> torch.atleast_2d(x) tensor([[2.2086, 2.5165], [0.1757, 0.5194]]) >>> x = torch.tensor(0.5) >>> y = torch.tensor(1.)
>>> torch.atleast_2d((x,y)) (tensor([[0.5000]]), tensor([[1.]])) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(atleast_2d, tensors, *tensors) if len(tensors) == 1: tensors = tensors[0] return _VF.atleast_2d(tensors) # type: ignore def atleast_3d(*tensors): r""" Returns a 3-dimensional view of each input tensor with zero dimensions. Input tensors with three or more dimensions are returned as-is. Args: input (Tensor or list of Tensors) Returns: output (Tensor or tuple of Tensors) Example: >>> x = torch.tensor(0.5) >>> x tensor(0.5000) >>> torch.atleast_3d(x) tensor([[[0.5000]]]) >>> y = torch.randn(2,2) >>> y tensor([[-0.8079, 0.7460], [-1.1647, 1.4734]]) >>> torch.atleast_3d(y) tensor([[[-0.8079], [ 0.7460]], [[-1.1647], [ 1.4734]]]) >>> x = torch.randn(1,1,1) >>> x tensor([[[-1.5689]]]) >>> torch.atleast_3d(x) tensor([[[-1.5689]]]) >>> x = torch.tensor(0.5) >>> y = torch.tensor(1.) >>> torch.atleast_3d((x,y)) (tensor([[[0.5000]]]), tensor([[[1.]]])) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(atleast_3d, tensors, *tensors) if len(tensors) == 1: tensors = tensors[0] return _VF.atleast_3d(tensors) # type: ignore if TYPE_CHECKING: pass # There's no good way to use this type annotation; cannot rename norm() to # _norm_impl() in a way that doesn't break JIT overloads. So leave untyped # for mypy for now. # def norm(input: Tensor, # p: Optional[Union[str, Number]] = "fro", # dim: Optional[Union[int, List[int]]] = None, # keepdim: bool = False, # out: Optional[Tensor] = None, # dtype: _dtype = None) -> Tensor: # return _norm_impl(input, p, dim, keepdim, out, dtype) else: # TODO: type dim as BroadcastingList when # https://github.com/pytorch/pytorch/issues/33782 is fixed @overload # noqa: 749 def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 # type: (Tensor, str, Optional[List[int]], bool, Optional[Tensor], Optional[int]) -> Tensor pass @overload # noqa: 749 def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 # type: (Tensor, Optional[number], Optional[List[int]], bool, Optional[Tensor], Optional[int]) -> Tensor pass @overload # noqa: 749 def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 # type: (Tensor, Optional[number], Optional[int], bool, Optional[Tensor], Optional[int]) -> Tensor pass @overload # noqa: 749 def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 # type: (Tensor, str, Optional[int], bool, Optional[Tensor], Optional[int]) -> Tensor pass def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 r"""Returns the matrix norm or vector norm of a given tensor. .. warning:: torch.norm is deprecated and may be removed in a future PyTorch release. Use :func:`torch.linalg.norm` instead, but note that :func:`torch.linalg.norm` has a different signature and slightly different behavior that is more consistent with NumPy's numpy.linalg.norm. Args: input (Tensor): the input tensor p (int, float, inf, -inf, 'fro', 'nuc', optional): the order of norm.
Default: ``'fro'`` The following norms can be calculated:

=====  ============================  ==========================
ord    matrix norm                   vector norm
=====  ============================  ==========================
None   Frobenius norm                2-norm
'fro'  Frobenius norm                --
'nuc'  nuclear norm                  --
Other  as vec norm when dim is None  sum(abs(x)**ord)**(1./ord)
=====  ============================  ==========================

dim (int, 2-tuple of ints, 2-list of ints, optional): If it is an int, vector norm will be calculated, if it is 2-tuple of ints, matrix norm will be calculated. If the value is None, matrix norm will be calculated when the input tensor only has two dimensions, vector norm will be calculated when the input tensor only has one dimension. If the input tensor has more than two dimensions, the vector norm will be applied to last dimension. keepdim (bool, optional): whether the output tensors have :attr:`dim` retained or not. Ignored if :attr:`dim` = ``None`` and :attr:`out` = ``None``. Default: ``False`` out (Tensor, optional): the output tensor. Ignored if :attr:`dim` = ``None`` and :attr:`out` = ``None``. dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. If specified, the input tensor is cast to :attr:`dtype` while performing the operation. Default: None. Example:: >>> import torch >>> a = torch.arange(9, dtype= torch.float) - 4 >>> b = a.reshape((3, 3)) >>> torch.norm(a) tensor(7.7460) >>> torch.norm(b) tensor(7.7460) >>> torch.norm(a, float('inf')) tensor(4.) >>> torch.norm(b, float('inf')) tensor(4.) >>> c = torch.tensor([[ 1, 2, 3],[-1, 1, 4]] , dtype= torch.float) >>> torch.norm(c, dim=0) tensor([1.4142, 2.2361, 5.0000]) >>> torch.norm(c, dim=1) tensor([3.7417, 4.2426]) >>> torch.norm(c, p=1, dim=1) tensor([6., 6.]) >>> d = torch.arange(8, dtype= torch.float).reshape(2,2,2) >>> torch.norm(d, dim=(1,2)) tensor([ 3.7417, 11.2250]) >>> torch.norm(d[0, :, :]), torch.norm(d[1, :, :]) (tensor(3.7417), tensor(11.2250)) """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( norm, (input,), input, p=p, dim=dim, keepdim=keepdim, out=out, dtype=dtype) ndim = input.dim() # catch default case if dim is None and out is None and dtype is None and p is not None: if isinstance(p, str): if p == "fro": return _VF.frobenius_norm(input, dim=(), keepdim=keepdim) # type: ignore if not isinstance(p, str): _dim = [i for i in range(ndim)] # noqa: C416 TODO: rewrite as list(range(m)) return _VF.norm(input, p, dim=_dim, keepdim=keepdim) # type: ignore # TODO: when https://github.com/pytorch/pytorch/issues/33782 is fixed # remove the overloads where dim is an int and replace with BroadcastingList1 # and remove next four lines, replace _dim with dim if dim is not None: if isinstance(dim, int): _dim = [dim] else: _dim = dim else: _dim = None # type: ignore if isinstance(p, str): if p == "fro": if dtype is not None: raise ValueError("dtype argument is not supported in frobenius norm") if _dim is None: _dim = [i for i in range(ndim)] # noqa: C416 TODO: rewrite as list(range(m)) if out is None: return _VF.frobenius_norm(input, _dim, keepdim=keepdim) # type: ignore else: return _VF.frobenius_norm(input, _dim, keepdim=keepdim, out=out) # type: ignore elif p == "nuc": if dtype is not None: raise ValueError("dtype argument is not supported in nuclear norm") if _dim is None: if out is None: return _VF.nuclear_norm(input, keepdim=keepdim) # type: ignore else: return _VF.nuclear_norm(input, keepdim=keepdim, out=out) #
type: ignore else: if out is None: return _VF.nuclear_norm(input, _dim, keepdim=keepdim) # type: ignore else: return _VF.nuclear_norm(input, _dim, keepdim=keepdim, out=out) # type: ignore raise RuntimeError(f"only valid string values are 'fro' and 'nuc', found {p}") else: if _dim is None: _dim = [i for i in range(ndim)] # noqa: C416 TODO: rewrite as list(range(m)) if out is None: if dtype is None: return _VF.norm(input, p, _dim, keepdim=keepdim) # type: ignore else: return _VF.norm(input, p, _dim, keepdim=keepdim, dtype=dtype) # type: ignore else: if dtype is None: return _VF.norm(input, p, _dim, keepdim=keepdim, out=out) # type: ignore else: return _VF.norm(input, p, _dim, keepdim=keepdim, dtype=dtype, out=out) # type: ignore def chain_matmul(*matrices): r"""Returns the matrix product of the :math:`N` 2-D tensors. This product is efficiently computed using the matrix chain order algorithm, which selects the order that incurs the lowest cost in terms of arithmetic operations (`[CLRS]`_). Note that since this is a function to compute the product, :math:`N` needs to be greater than or equal to 2; if equal to 2 then a trivial matrix-matrix product is returned. If :math:`N` is 1, then this is a no-op - the original matrix is returned as is. Args: matrices (Tensors...): a sequence of 2 or more 2-D tensors whose product is to be determined. Returns: Tensor: if the :math:`i^{th}` tensor was of dimensions :math:`p_{i} \times p_{i + 1}`, then the product would be of dimensions :math:`p_{1} \times p_{N + 1}`. Example:: >>> a = torch.randn(3, 4) >>> b = torch.randn(4, 5) >>> c = torch.randn(5, 6) >>> d = torch.randn(6, 7) >>> torch.chain_matmul(a, b, c, d) tensor([[ -2.3375, -3.9790, -4.1119, -6.6577, 9.5609, -11.5095, -3.2614], [ 21.4038, 3.3378, -8.4982, -5.2457, -10.2561, -2.4684, 2.7163], [ -0.9647, -5.8917, -2.3213, -5.2284, 12.8615, -12.2816, -2.5095]]) .. _`[CLRS]`: https://mitpress.mit.edu/books/introduction-algorithms-third-edition """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in matrices) and has_torch_function(matrices): return handle_torch_function(chain_matmul, matrices, *matrices) return _VF.chain_matmul(matrices) # type: ignore def _lu_impl(A, pivot=True, get_infos=False, out=None): # type: (Tensor, bool, bool, Any) -> Tuple[Tensor, Tensor, Tensor] r"""Computes the LU factorization of a matrix or batches of matrices :attr:`A`. Returns a tuple containing the LU factorization and pivots of :attr:`A`. Pivoting is done if :attr:`pivot` is set to ``True``. .. note:: The pivots returned by the function are 1-indexed. If :attr:`pivot` is ``False``, then the returned pivots tensor is filled with zeros of the appropriate size. .. note:: LU factorization with :attr:`pivot` = ``False`` is not available for CPU, and attempting to do so will throw an error. However, LU factorization with :attr:`pivot` = ``False`` is available for CUDA. .. note:: This function does not check if the factorization was successful or not if :attr:`get_infos` is ``True`` since the status of the factorization is present in the third element of the return tuple. .. note:: In the case of batches of square matrices with size less than or equal to 32 on a CUDA device, the LU factorization is repeated for singular matrices due to a bug in the MAGMA library (see magma issue 13). .. note:: ``L``, ``U``, and ``P`` can be derived using :func:`torch.lu_unpack`. Arguments: A (Tensor): the tensor to factor of size :math:`(*, m, n)` pivot (bool, optional): controls whether pivoting is done.
Default: ``True`` get_infos (bool, optional): if set to ``True``, returns an info IntTensor. Default: ``False`` out (tuple, optional): optional output tuple. If :attr:`get_infos` is ``True``, then the elements in the tuple are Tensor, IntTensor, and IntTensor. If :attr:`get_infos` is ``False``, then the elements in the tuple are Tensor, IntTensor. Default: ``None`` Returns: (Tensor, IntTensor, IntTensor (optional)): A tuple of tensors containing - **factorization** (*Tensor*): the factorization of size :math:`(*, m, n)` - **pivots** (*IntTensor*): the pivots of size :math:`(*, m)` - **infos** (*IntTensor*, *optional*): if :attr:`get_infos` is ``True``, this is a tensor of size :math:`(*)` where non-zero values indicate whether factorization for the matrix or each minibatch has succeeded or failed Example:: >>> A = torch.randn(2, 3, 3) >>> A_LU, pivots = torch.lu(A) >>> A_LU tensor([[[ 1.3506, 2.5558, -0.0816], [ 0.1684, 1.1551, 0.1940], [ 0.1193, 0.6189, -0.5497]], [[ 0.4526, 1.2526, -0.3285], [-0.7988, 0.7175, -0.9701], [ 0.2634, -0.9255, -0.3459]]]) >>> pivots tensor([[ 3, 3, 3], [ 3, 3, 3]], dtype=torch.int32) >>> A_LU, pivots, info = torch.lu(A, get_infos=True) >>> if info.nonzero().size(0) == 0: ... print('LU factorization succeeded for all samples!') LU factorization succeeded for all samples! """ # If get_infos is True, then we don't need to check for errors and vice versa return torch._lu_with_info(A, pivot=pivot, check_errors=(not get_infos)) if TYPE_CHECKING: _ListOrSeq = Sequence[Tensor] else: _ListOrSeq = List[Tensor] def _check_list_size(out_len: int, get_infos: bool, out: _ListOrSeq) -> None: get_infos_int = 1 if get_infos else 0 if out_len - get_infos_int != 2: raise TypeError(f"expected tuple of {2 + int(get_infos)} elements but got {out_len}") if not isinstance(out, (tuple, list)): raise TypeError(f"argument 'out' must be tuple of Tensors, not {type(out).__name__}") def _lu_with_infos(A, pivot=True, get_infos=False, out=None): # type: (Tensor, bool, bool, Optional[Tuple[Tensor, Tensor, Tensor]]) -> Tuple[Tensor, Tensor, Tensor] if not torch.jit.is_scripting(): if type(A) is not Tensor and has_torch_function((A,)): return handle_torch_function( lu, (A,), A, pivot=pivot, get_infos=get_infos, out=out) result = _lu_impl(A, pivot, get_infos, out) if out is not None: _check_list_size(len(out), get_infos, out) for i in range(len(out)): out[i].resize_as_(result[i]).copy_(result[i]) return out else: return result # A_LU, pivots, infos def _lu_no_infos(A, pivot=True, get_infos=False, out=None): # type: (Tensor, bool, bool, Optional[Tuple[Tensor, Tensor]]) -> Tuple[Tensor, Tensor] # need to check for torch_function here so that we exit early when a # __torch_function__ override is present if not torch.jit.is_scripting(): if type(A) is not Tensor and has_torch_function((A,)): return handle_torch_function( lu, (A,), A, pivot=pivot, get_infos=get_infos, out=out) result = _lu_impl(A, pivot, get_infos, out) if out is not None: _check_list_size(len(out), get_infos, out) for i in range(len(out)): out[i].resize_as_(result[i]).copy_(result[i]) return out else: return result[0], result[1] # A_LU, pivots # The return type of lu depends on `get_infos`, so in order to resolve the output type # of lu in TorchScript we need to statically know the value of `get_infos` lu = boolean_dispatch( arg_name='get_infos', arg_index=2, default=False, if_true=_lu_with_infos, if_false=_lu_no_infos, module_name=__name__, func_name='lu') lu.__doc__ = _lu_impl.__doc__ def align_tensors(*tensors): raise RuntimeError('`align_tensors` not yet implemented.')
================================================ FILE: patches/pytorch/1.7.0/functional.py ================================================ from typing import ( Tuple, Optional, Union, Any, Sequence, TYPE_CHECKING ) import librosa # STFT patch for aarch64 import numpy as np import torch import torch.nn.functional as F from torch.types import _size from ._lowrank import svd_lowrank, pca_lowrank from .overrides import has_torch_function, handle_torch_function from ._jit_internal import boolean_dispatch, List from ._jit_internal import _overload as overload Tensor = torch.Tensor from torch import _VF __all__ = [ 'atleast_1d', 'atleast_2d', 'atleast_3d', 'align_tensors', 'broadcast_tensors', 'cartesian_prod', 'block_diag', 'cdist', 'chain_matmul', 'einsum', 'istft', 'lu', 'lu_unpack', 'norm', 'meshgrid', 'pca_lowrank', 'split', 'stft', 'svd_lowrank', 'tensordot', 'unique', 'unique_consecutive', ] def broadcast_tensors(*tensors): r"""broadcast_tensors(*tensors) -> List of Tensors Broadcasts the given tensors according to :ref:`broadcasting-semantics`. Args: *tensors: any number of tensors of the same type .. warning:: More than one element of a broadcasted tensor may refer to a single memory location. As a result, in-place operations (especially ones that are vectorized) may result in incorrect behavior. If you need to write to the tensors, please clone them first. Example:: >>> x = torch.arange(3).view(1, 3) >>> y = torch.arange(2).view(2, 1) >>> a, b = torch.broadcast_tensors(x, y) >>> a.size() torch.Size([2, 3]) >>> a tensor([[0, 1, 2], [0, 1, 2]]) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(broadcast_tensors, tensors, *tensors) return _VF.broadcast_tensors(tensors) # type: ignore def split(tensor, split_size_or_sections, dim=0): r"""Splits the tensor into chunks. Each chunk is a view of the original tensor. If :attr:`split_size_or_sections` is an integer type, then :attr:`tensor` will be split into equally sized chunks (if possible). Last chunk will be smaller if the tensor size along the given dimension :attr:`dim` is not divisible by :attr:`split_size`. If :attr:`split_size_or_sections` is a list, then :attr:`tensor` will be split into ``len(split_size_or_sections)`` chunks with sizes in :attr:`dim` according to :attr:`split_size_or_sections`. Arguments: tensor (Tensor): tensor to split. split_size_or_sections (int) or (list(int)): size of a single chunk or list of sizes for each chunk dim (int): dimension along which to split the tensor. Example:: >>> a = torch.arange(10).reshape(5,2) >>> a tensor([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]]) >>> torch.split(a, 2) (tensor([[0, 1], [2, 3]]), tensor([[4, 5], [6, 7]]), tensor([[8, 9]])) >>> torch.split(a, [1,4]) (tensor([[0, 1]]), tensor([[2, 3], [4, 5], [6, 7], [8, 9]])) """ if not torch.jit.is_scripting(): if type(tensor) is not Tensor and has_torch_function((tensor,)): return handle_torch_function(split, (tensor,), tensor, split_size_or_sections, dim=dim) # Overwriting reason: # This dispatches to two ATen functions depending on the type of # split_size_or_sections. The branching code is in tensor.py, which we # call here. 
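# Illustrative sketch (values assume these exact toy inputs): an int gives
# equal chunks with a smaller remainder chunk, a list gives exactly those sizes:
#   torch.split(torch.arange(5), 2)      -> (tensor([0, 1]), tensor([2, 3]), tensor([4]))
#   torch.split(torch.arange(5), [1, 4]) -> (tensor([0]), tensor([1, 2, 3, 4]))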
return tensor.split(split_size_or_sections, dim) if TYPE_CHECKING: _Indices = _size else: _Indices = List[int] # equivalent to itertools.product(indices) def _indices_product(indices: _Indices) -> List[List[int]]: empty_list = torch.jit.annotate(List[int], []) result = [empty_list] for idx in indices: result_temp = torch.jit.annotate(List[List[int]], []) for res in result: for i in range(idx): result_temp.append(res + [i]) result = result_temp return result def _index_tensor_with_indices_list(tensor, indices): # type: (Tensor, List[int]) -> Tensor out = tensor for index in indices: out = out[index] return out def lu_unpack(LU_data, LU_pivots, unpack_data=True, unpack_pivots=True): # type: (Tensor, Tensor, bool, bool) -> (Tuple[Optional[Tensor], Optional[Tensor], Optional[Tensor]]) r"""Unpacks the data and pivots from a LU factorization of a tensor. Returns a tuple of tensors as ``(the pivots, the L tensor, the U tensor)``. Arguments: LU_data (Tensor): the packed LU factorization data LU_pivots (Tensor): the packed LU factorization pivots unpack_data (bool): flag indicating if the data should be unpacked unpack_pivots (bool): flag indicating if the pivots should be unpacked Examples:: >>> A = torch.randn(2, 3, 3) >>> A_LU, pivots = A.lu() >>> P, A_L, A_U = torch.lu_unpack(A_LU, pivots) >>> >>> # can recover A from factorization >>> A_ = torch.bmm(P, torch.bmm(A_L, A_U)) >>> # LU factorization of a rectangular matrix: >>> A = torch.randn(2, 3, 2) >>> A_LU, pivots = A.lu() >>> P, A_L, A_U = torch.lu_unpack(A_LU, pivots) >>> P tensor([[[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]], [[0., 0., 1.], [0., 1., 0.], [1., 0., 0.]]]) >>> A_L tensor([[[ 1.0000, 0.0000], [ 0.4763, 1.0000], [ 0.3683, 0.1135]], [[ 1.0000, 0.0000], [ 0.2957, 1.0000], [-0.9668, -0.3335]]]) >>> A_U tensor([[[ 2.1962, 1.0881], [ 0.0000, -0.8681]], [[-1.0947, 0.3736], [ 0.0000, 0.5718]]]) >>> A_ = torch.bmm(P, torch.bmm(A_L, A_U)) >>> torch.norm(A_ - A) tensor(2.9802e-08) """ if not torch.jit.is_scripting(): tens_ops = (LU_data, LU_pivots) if any([type(t) is not Tensor for t in tens_ops]) and has_torch_function(tens_ops): return handle_torch_function( lu_unpack, tens_ops, LU_data, LU_pivots, unpack_data=unpack_data, unpack_pivots=unpack_pivots) shape = LU_data.shape # In generalized LU factorization, the following shape relations hold: # A.shape[-2:] == (m, n) # P.shape[-2:] == (m, m) # L.shape[-2:] == (m, k) # U.shape[-2:] == (k, n) # where k = min(m, n) m, n = shape[-2:] k = min(m, n) if unpack_data: U: Optional[Tensor] = LU_data.triu() assert U is not None if m != k: U = U.narrow(-2, 0, k) L: Optional[Tensor] = LU_data.tril() assert L is not None if k != n: L = L.narrow(-1, 0, k) L.diagonal(dim1=-2, dim2=-1).fill_(1) else: L = U = None if unpack_pivots: LU_pivots_zero_idx = LU_pivots - 1 if LU_data.dim() > 2: P: Optional[Tensor] = torch.eye(m, device=LU_data.device, dtype=LU_data.dtype) \ .expand(shape[:-1] + (m,)) \ .clone(memory_format=torch.contiguous_format) assert P is not None # TODO: rewrite when TorchScript supports product and map as # product(*map(lambda x: list(range(x)), shape[:-2])) when issue 33781 is fixed indices = _indices_product(shape[:-2]) for idx in indices: final_order = [i for i in range(m)] # noqa: C416 TODO: rewrite as list(range(m)) for k, j in enumerate(_index_tensor_with_indices_list(LU_pivots_zero_idx, idx)): final_order[k], final_order[j] = final_order[j], final_order[k] # TODO: remove _index_tensor_with_indices_list when TorchScript supports indexing Tensor with list p_idx = 
_index_tensor_with_indices_list(P, idx) p_idx.copy_(p_idx.index_select(1, torch.as_tensor(final_order, device=LU_pivots.device))) else: P = torch.eye(m, device=LU_data.device, dtype=LU_data.dtype) final_order = [i for i in range(m)] # noqa: C416 TODO: rewrite as list(range(m)) for k, j, in enumerate(LU_pivots_zero_idx): final_order[k], final_order[j] = final_order[j], final_order[k] P = P.index_select(1, torch.as_tensor(final_order, device=LU_pivots.device)) else: P = None return P, L, U def einsum(equation, *operands): r"""einsum(equation, *operands) -> Tensor This function provides a way of computing multilinear expressions (i.e. sums of products) using the Einstein summation convention. Args: equation (string): The equation is given in terms of lower case letters (indices) to be associated with each dimension of the operands and result. The left hand side lists the operands dimensions, separated by commas. There should be one index letter per tensor dimension. The right hand side follows after `->` and gives the indices for the output. If the `->` and right hand side are omitted, it is implicitly defined as the alphabetically sorted list of all indices appearing exactly once in the left hand side. The indices not appearing in the output are summed over after multiplying the operands' entries. If an index appears several times for the same operand, a diagonal is taken. Ellipses `...` represent a fixed number of dimensions. If the right hand side is inferred, the ellipsis dimensions are at the beginning of the output. operands (Tensor): The operands to compute the Einstein sum of. .. note:: This function does not optimize the given expression, so a different formula for the same computation may run faster or consume less memory. Projects like opt_einsum (https://optimized-einsum.readthedocs.io/en/stable/) can optimize the formula for you.
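As a tiny deterministic sketch (summing all entries, and taking a trace):

>>> torch.einsum('ij->', torch.arange(6.).reshape(2, 3))
tensor(15.)
>>> torch.einsum('ii', torch.eye(3))
tensor(3.)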
Examples:: >>> x = torch.randn(5) >>> y = torch.randn(4) >>> torch.einsum('i,j->ij', x, y) # outer product tensor([[-0.0570, -0.0286, -0.0231, 0.0197], [ 1.2616, 0.6335, 0.5113, -0.4351], [ 1.4452, 0.7257, 0.5857, -0.4984], [-0.4647, -0.2333, -0.1883, 0.1603], [-1.1130, -0.5588, -0.4510, 0.3838]]) >>> A = torch.randn(3,5,4) >>> l = torch.randn(2,5) >>> r = torch.randn(2,4) >>> torch.einsum('bn,anm,bm->ba', l, A, r) # compare torch.nn.functional.bilinear tensor([[-0.3430, -5.2405, 0.4494], [ 0.3311, 5.5201, -3.0356]]) >>> As = torch.randn(3,2,5) >>> Bs = torch.randn(3,5,4) >>> torch.einsum('bij,bjk->bik', As, Bs) # batch matrix multiplication tensor([[[-1.0564, -1.5904, 3.2023, 3.1271], [-1.6706, -0.8097, -0.8025, -2.1183]], [[ 4.2239, 0.3107, -0.5756, -0.2354], [-1.4558, -0.3460, 1.5087, -0.8530]], [[ 2.8153, 1.8787, -4.3839, -1.2112], [ 0.3728, -2.1131, 0.0921, 0.8305]]]) >>> A = torch.randn(3, 3) >>> torch.einsum('ii->i', A) # diagonal tensor([-0.7825, 0.8291, -0.1936]) >>> A = torch.randn(4, 3, 3) >>> torch.einsum('...ii->...i', A) # batch diagonal tensor([[-1.0864, 0.7292, 0.0569], [-0.9725, -1.0270, 0.6493], [ 0.5832, -1.1716, -1.5084], [ 0.4041, -1.1690, 0.8570]]) >>> A = torch.randn(2, 3, 4, 5) >>> torch.einsum('...ij->...ji', A).shape # batch permute torch.Size([2, 3, 5, 4]) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in operands) and has_torch_function(operands): return handle_torch_function(einsum, operands, equation, *operands) if len(operands) == 1 and isinstance(operands[0], (list, tuple)): # the old interface of passing the operands as one list argument _operands = operands[0] # recurse in case operands contains a value that has a torch function # in the original implementation this line is omitted return einsum(equation, *_operands) return _VF.einsum(equation, operands) # type: ignore if TYPE_CHECKING: # The JIT doesn't understand Union, so only add type annotation for mypy def meshgrid(*tensors: Union[Tensor, List[Tensor]]) -> Tuple[Tensor, ...]: return _meshgrid(*tensors) else: def meshgrid(*tensors): return _meshgrid(*tensors) def _meshgrid(*tensors): r"""Take :math:`N` tensors, each of which can be either a scalar or a 1-dimensional vector, and create :math:`N` N-dimensional grids, where the :math:`i` :sup:`th` grid is defined by expanding the :math:`i` :sup:`th` input over dimensions defined by other inputs. Args: tensors (list of Tensor): list of scalars or 1 dimensional tensors. Scalars will be treated as tensors of size :math:`(1,)` automatically Returns: seq (sequence of Tensors): If the input has :math:`k` tensors of size :math:`(N_1,), (N_2,), \ldots , (N_k,)`, then the output would also have :math:`k` tensors, where all tensors are of size :math:`(N_1, N_2, \ldots , N_k)`.
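A short sketch of the typical use, evaluating a function of two variables on a grid (toy integer inputs):

>>> gx, gy = torch.meshgrid(torch.tensor([1, 2]), torch.tensor([3, 4]))
>>> gx * gy
tensor([[3, 4],
        [6, 8]])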
Example:: >>> x = torch.tensor([1, 2, 3]) >>> y = torch.tensor([4, 5, 6]) >>> grid_x, grid_y = torch.meshgrid(x, y) >>> grid_x tensor([[1, 1, 1], [2, 2, 2], [3, 3, 3]]) >>> grid_y tensor([[4, 5, 6], [4, 5, 6], [4, 5, 6]]) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(meshgrid, tensors, *tensors) if len(tensors) == 1 and isinstance(tensors[0], (list, tuple)): # the old interface of passing the operands as one list argument tensors = tensors[0] # type: ignore return _VF.meshgrid(tensors) # type: ignore def stft(input: Tensor, n_fft: int, hop_length: Optional[int] = None, win_length: Optional[int] = None, window: Optional[Tensor] = None, center: bool = True, pad_mode: str = 'reflect', normalized: bool = False, onesided: Optional[bool] = None, return_complex: Optional[bool] = None) -> Tensor: r"""Short-time Fourier transform (STFT). .. warning:: Setting :attr:`return_complex` explicitly will be required in a future PyTorch release. Set it to False to preserve the current behavior or True to return a complex output. The STFT computes the Fourier transform of short overlapping windows of the input. This gives the frequency components of the signal as they change over time. The interface of this function is modeled after the librosa_ stft function. .. _librosa: https://librosa.org/doc/latest/generated/librosa.stft.html Ignoring the optional batch dimension, this method computes the following expression: .. math:: X[m, \omega] = \sum_{k = 0}^{\text{win\_length-1}}% \text{window}[k]\ \text{input}[m \times \text{hop\_length} + k]\ % \exp\left(- j \frac{2 \pi \cdot \omega k}{\text{win\_length}}\right), where :math:`m` is the index of the sliding window, and :math:`\omega` is the frequency, where :math:`0 \leq \omega < \text{n\_fft}`. When :attr:`onesided` is the default value ``True``, * :attr:`input` must be either a 1-D time sequence or a 2-D batch of time sequences. * If :attr:`hop_length` is ``None`` (default), it is treated as equal to ``floor(n_fft / 4)``. * If :attr:`win_length` is ``None`` (default), it is treated as equal to :attr:`n_fft`. * :attr:`window` can be a 1-D tensor of size :attr:`win_length`, e.g., from :meth:`torch.hann_window`. If :attr:`window` is ``None`` (default), it is treated as if having :math:`1` everywhere in the window. If :math:`\text{win\_length} < \text{n\_fft}`, :attr:`window` will be padded on both sides to length :attr:`n_fft` before being applied. * If :attr:`center` is ``True`` (default), :attr:`input` will be padded on both sides so that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. Otherwise, the :math:`t`-th frame begins at time :math:`t \times \text{hop\_length}`. * :attr:`pad_mode` determines the padding method used on :attr:`input` when :attr:`center` is ``True``. See :meth:`torch.nn.functional.pad` for all available options. Default is ``"reflect"``. * If :attr:`onesided` is ``True`` (default for real input), only values for :math:`\omega` in :math:`\left[0, 1, 2, \dots, \left\lfloor \frac{\text{n\_fft}}{2} \right\rfloor + 1\right]` are returned because the real-to-complex Fourier transform satisfies the conjugate symmetry, i.e., :math:`X[m, \omega] = X[m, \text{n\_fft} - \omega]^*`. Note if the input or window tensors are complex, then :attr:`onesided` output is not possible.
* If :attr:`normalized` is ``True`` (default is ``False``), the function returns the normalized STFT results, i.e., multiplied by :math:`(\text{frame\_length})^{-0.5}`. * If :attr:`return_complex` is ``True`` (default if input is complex), the return is a ``input.dim() + 1`` dimensional complex tensor. If ``False``, the output is a ``input.dim() + 2`` dimensional real tensor where the last dimension represents the real and imaginary components. Returns either a complex tensor of size :math:`(* \times N \times T)` if :attr:`return_complex` is ``True``, or a real tensor of size :math:`(* \times N \times T \times 2)`, where :math:`*` is the optional batch size of :attr:`input`, :math:`N` is the number of frequencies where STFT is applied, and :math:`T` is the total number of frames used. .. warning:: This function changed signature at version 0.4.1. Calling with the previous signature may cause an error or return an incorrect result. Arguments: input (Tensor): the input tensor n_fft (int): size of Fourier transform hop_length (int, optional): the distance between neighboring sliding window frames. Default: ``None`` (treated as equal to ``floor(n_fft / 4)``) win_length (int, optional): the size of window frame and STFT filter. Default: ``None`` (treated as equal to :attr:`n_fft`) window (Tensor, optional): the optional window function. Default: ``None`` (treated as window of all :math:`1` s) center (bool, optional): whether to pad :attr:`input` on both sides so that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. Default: ``True`` pad_mode (string, optional): controls the padding method used when :attr:`center` is ``True``. Default: ``"reflect"`` normalized (bool, optional): controls whether to return the normalized STFT results. Default: ``False`` onesided (bool, optional): controls whether to return half of the results to avoid redundancy for real inputs. Default: ``True`` for real :attr:`input` and :attr:`window`, ``False`` otherwise. return_complex (bool, optional): whether to return a complex tensor, or a real tensor with an extra last dimension for the real and imaginary components. Returns: Tensor: A tensor containing the STFT result with shape described above """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( stft, (input,), input, n_fft, hop_length=hop_length, win_length=win_length, window=window, center=center, pad_mode=pad_mode, normalized=normalized, onesided=onesided, return_complex=return_complex) # TODO: after having proper ways to map Python strings to ATen Enum, move # this and F.pad to ATen.
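# Shape sketch for the centering step below (a worked example, not from the
# upstream source): with n_fft=4 and center=True, a length-10 signal is
# padded by n_fft // 2 = 2 samples on each side (to length 14) before framing,
# so that frame t is centered at t * hop_length of the original signal.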
if center: signal_dim = input.dim() extended_shape = [1] * (3 - signal_dim) + list(input.size()) pad = int(n_fft // 2) input = F.pad(input.view(extended_shape), (pad, pad), pad_mode) input = input.view(input.shape[-signal_dim:]) # STFT patch for aarch64 # https://stackoverflow.com/a/66872148 librosa_stft = librosa.stft(input.cpu().detach().numpy().reshape(-1), n_fft, hop_length, win_length, window="hann", center=center, pad_mode=pad_mode) librosa_stft = np.array([[a.real, a.imag] for a in librosa_stft]) librosa_stft = np.transpose(librosa_stft, axes=[0, 2, 1]) librosa_stft = np.expand_dims(librosa_stft, 0) librosa_stft = torch.from_numpy(librosa_stft) return librosa_stft #return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore # normalized, onesided, return_complex) def istft(input: Tensor, n_fft: int, hop_length: Optional[int] = None, win_length: Optional[int] = None, window: Optional[Tensor] = None, center: bool = True, normalized: bool = False, onesided: Optional[bool] = None, length: Optional[int] = None, return_complex: bool = False) -> Tensor: r"""Inverse short time Fourier Transform. This is expected to be the inverse of :func:`~torch.stft`. It has the same parameters (+ additional optional parameter of :attr:`length`) and it should return the least squares estimation of the original signal. The algorithm will check whether the NOLA (nonzero overlap-add) condition is satisfied. It is important to choose the parameters :attr:`window` and :attr:`center` so that the envelope created by the summation of all the windows is never zero at any point in time. Specifically, :math:`\sum_{t=-\infty}^{\infty} |w|^2[n - t \times \text{hop\_length}] \neq 0`. Since :func:`~torch.stft` discards elements at the end of the signal if they do not fit in a frame, ``istft`` may return a shorter signal than the original signal (this can occur if :attr:`center` is ``False``, since the signal isn't padded). If :attr:`center` is ``True``, then there will be padding, e.g. ``'constant'``, ``'reflect'``, etc. The left padding can be trimmed off exactly because it can be calculated, but the right padding cannot be calculated without additional information. Example: Suppose the last window is ``[17, 18, 0, 0, 0]`` vs ``[18, 0, 0, 0, 0]``; :attr:`n_fft`, :attr:`hop_length`, and :attr:`win_length` are all the same in both cases, which prevents the calculation of the right padding. The missing values could be zeros or a reflection of the signal, so providing :attr:`length` can be useful. If :attr:`length` is ``None`` then padding will be aggressively removed (with some loss of signal). [1] D. W. Griffin and J. S. Lim, "Signal estimation from modified short-time Fourier transform," IEEE Trans. ASSP, vol. 32, no. 2, pp. 236-243, Apr. 1984. Arguments: input (Tensor): The input tensor. Expected to be output of :func:`~torch.stft`, can either be complex (``channel``, ``fft_size``, ``n_frame``), or real (``channel``, ``fft_size``, ``n_frame``, 2) where the ``channel`` dimension is optional. n_fft (int): Size of Fourier transform hop_length (Optional[int]): The distance between neighboring sliding window frames. (Default: ``n_fft // 4``) win_length (Optional[int]): The size of window frame and STFT filter. (Default: ``n_fft``) window (Optional[torch.Tensor]): The optional window function. (Default: ``torch.ones(win_length)``) center (bool): Whether :attr:`input` was padded on both sides so that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. (Default: ``True``) normalized (bool): Whether the STFT was normalized.
(Default: ``False``) onesided (Optional[bool]): Whether the STFT was onesided. (Default: ``True`` if ``n_fft != fft_size`` in the input size) length (Optional[int]): The amount to trim the signal by (i.e. the original signal length). (Default: whole signal) return_complex (Optional[bool]): Whether the output should be complex, or if the input should be assumed to derive from a real signal and window. Note that this is incompatible with ``onesided=True``. (Default: ``False``) Returns: Tensor: Least squares estimation of the original signal of size (..., signal_length) """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( istft, (input,), input, n_fft, hop_length=hop_length, win_length=win_length, window=window, center=center, normalized=normalized, onesided=onesided, length=length, return_complex=return_complex) return _VF.istft(input, n_fft, hop_length, win_length, window, center, # type: ignore normalized, onesided, length, return_complex) del torch.unique_dim if TYPE_CHECKING: # These _impl functions return a variable number of tensors as output with # __torch_function__; tuple unpacking is done already rather than being # done by the caller of the _impl function _unique_impl_out = Any else: _unique_impl_out = Tuple[Tensor, Tensor, Tensor] def _unique_impl(input: Tensor, sorted: bool = True, return_inverse: bool = False, return_counts: bool = False, dim: Optional[int] = None) -> _unique_impl_out: r"""Returns the unique elements of the input tensor. .. note:: This function is different from :func:`torch.unique_consecutive` in the sense that this function also eliminates non-consecutive duplicate values. .. note:: Currently in the CUDA implementation and the CPU implementation when dim is specified, `torch.unique` always sorts the tensor at the beginning regardless of the `sorted` argument. Sorting could be slow, so if your input tensor is already sorted, it is recommended to use :func:`torch.unique_consecutive` which avoids the sorting. Arguments: input (Tensor): the input tensor sorted (bool): Whether to sort the unique elements in ascending order before returning as output. return_inverse (bool): Whether to also return the indices for where elements in the original input ended up in the returned unique list. return_counts (bool): Whether to also return the counts for each unique element. dim (int): the dimension to apply unique. If ``None``, the unique of the flattened input is returned. default: ``None`` Returns: (Tensor, Tensor (optional), Tensor (optional)): A tensor or a tuple of tensors containing - **output** (*Tensor*): the output list of unique scalar elements. - **inverse_indices** (*Tensor*): (optional) if :attr:`return_inverse` is True, there will be an additional returned tensor (same shape as input) representing the indices for where elements in the original input map to in the output; otherwise, this function will only return a single tensor. - **counts** (*Tensor*): (optional) if :attr:`return_counts` is True, there will be an additional returned tensor (same shape as output or output.size(dim), if dim was specified) representing the number of occurrences for each unique value or tensor.
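A brief sketch of the ``dim`` path (a toy 2-D input; rows are deduplicated as whole units, and the result is sorted):

>>> x = torch.tensor([[3, 4], [1, 2], [1, 2]])
>>> torch.unique(x, dim=0)
tensor([[1, 2],
        [3, 4]])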
Example:: >>> output = torch.unique(torch.tensor([1, 3, 2, 3], dtype=torch.long)) >>> output tensor([ 1, 2, 3]) >>> output, inverse_indices = torch.unique( torch.tensor([1, 3, 2, 3], dtype=torch.long), sorted=True, return_inverse=True) >>> output tensor([ 1, 2, 3]) >>> inverse_indices tensor([ 0, 2, 1, 2]) >>> output, inverse_indices = torch.unique( torch.tensor([[1, 3], [2, 3]], dtype=torch.long), sorted=True, return_inverse=True) >>> output tensor([ 1, 2, 3]) >>> inverse_indices tensor([[ 0, 2], [ 1, 2]]) """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( unique, (input,), input, sorted=sorted, return_inverse=return_inverse, return_counts=return_counts, dim=dim) if dim is not None: output, inverse_indices, counts = _VF.unique_dim( # type: ignore input, dim, sorted=sorted, return_inverse=return_inverse, return_counts=return_counts, ) else: output, inverse_indices, counts = torch._unique2( input, sorted=sorted, return_inverse=return_inverse, return_counts=return_counts, ) return output, inverse_indices, counts def _unique_consecutive_impl(input: Tensor, return_inverse: bool = False, return_counts: bool = False, dim: Optional[int] = None) -> _unique_impl_out: r"""Eliminates all but the first element from every consecutive group of equivalent elements. .. note:: This function is different from :func:`torch.unique` in the sense that this function only eliminates consecutive duplicate values. The semantics are similar to `std::unique` in C++. Arguments: input (Tensor): the input tensor return_inverse (bool): Whether to also return the indices for where elements in the original input ended up in the returned unique list. return_counts (bool): Whether to also return the counts for each unique element. dim (int): the dimension to apply unique. If ``None``, the unique of the flattened input is returned. default: ``None`` Returns: (Tensor, Tensor (optional), Tensor (optional)): A tensor or a tuple of tensors containing - **output** (*Tensor*): the output list of unique scalar elements. - **inverse_indices** (*Tensor*): (optional) if :attr:`return_inverse` is True, there will be an additional returned tensor (same shape as input) representing the indices for where elements in the original input map to in the output; otherwise, this function will only return a single tensor. - **counts** (*Tensor*): (optional) if :attr:`return_counts` is True, there will be an additional returned tensor (same shape as output or output.size(dim), if dim was specified) representing the number of occurrences for each unique value or tensor.
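A small sketch combining ``dim`` with ``return_counts`` (toy input with consecutive duplicate rows):

>>> x = torch.tensor([[1, 1], [1, 1], [2, 2]])
>>> output, counts = torch.unique_consecutive(x, dim=0, return_counts=True)
>>> output
tensor([[1, 1],
        [2, 2]])
>>> counts
tensor([2, 1])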
Example:: >>> x = torch.tensor([1, 1, 2, 2, 3, 1, 1, 2]) >>> output = torch.unique_consecutive(x) >>> output tensor([1, 2, 3, 1, 2]) >>> output, inverse_indices = torch.unique_consecutive(x, return_inverse=True) >>> output tensor([1, 2, 3, 1, 2]) >>> inverse_indices tensor([0, 0, 1, 1, 2, 3, 3, 4]) >>> output, counts = torch.unique_consecutive(x, return_counts=True) >>> output tensor([1, 2, 3, 1, 2]) >>> counts tensor([2, 2, 1, 2, 1]) """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( unique_consecutive, (input,), input, return_inverse=return_inverse, return_counts=return_counts, dim=dim) output, inverse_indices, counts = _VF.unique_consecutive( # type: ignore input, return_inverse=return_inverse, return_counts=return_counts, dim=dim) return output, inverse_indices, counts def _return_counts(input, sorted=True, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor] if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_impl(input, sorted, return_inverse, return_counts, dim) output, _, counts = _unique_impl(input, sorted, return_inverse, return_counts, dim) return output, counts def _return_output(input, sorted=True, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, bool, Optional[int]) -> Tensor if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_impl(input, sorted, return_inverse, return_counts, dim) output, _, _ = _unique_impl(input, sorted, return_inverse, return_counts, dim) return output def _return_inverse(input, sorted=True, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor] if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_impl(input, sorted, return_inverse, return_counts, dim) output, inverse_indices, _ = _unique_impl(input, sorted, return_inverse, return_counts, dim) return output, inverse_indices _return_inverse_false = boolean_dispatch( arg_name='return_counts', arg_index=3, default=False, if_true=_return_counts, if_false=_return_output, module_name=__name__, func_name='unique') _return_inverse_true = boolean_dispatch( arg_name='return_counts', arg_index=3, default=False, if_true=_unique_impl, if_false=_return_inverse, module_name=__name__, func_name='unique') # The return type of unique depends on `return_inverse`, and `return_counts` so in order to # resolve the output type in TorchScript we need to statically know the value of both parameters unique = boolean_dispatch( arg_name='return_inverse', arg_index=2, default=False, if_true=_return_inverse_true, if_false=_return_inverse_false, module_name=__name__, func_name='unique') unique.__doc__ = _unique_impl.__doc__ def _consecutive_return_counts(input, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor] if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_consecutive_impl(input, return_inverse, return_counts, dim) output, _, counts = _unique_consecutive_impl(input, return_inverse, return_counts, dim) return output, counts def _consecutive_return_output(input, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, Optional[int]) -> Tensor if 
not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_consecutive_impl(input, return_inverse, return_counts, dim) output, _, _ = _unique_consecutive_impl(input, return_inverse, return_counts, dim) return output def _consecutive_return_inverse(input, return_inverse=False, return_counts=False, dim=None): # type: (Tensor, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor] if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return _unique_consecutive_impl(input, return_inverse, return_counts, dim) output, inverse_indices, _ = _unique_consecutive_impl(input, return_inverse, return_counts, dim) return output, inverse_indices _consecutive_return_inverse_false = boolean_dispatch( arg_name='return_counts', arg_index=1, default=False, if_true=_consecutive_return_counts, if_false=_consecutive_return_output, module_name=__name__, func_name='unique_consecutive') _consecutive_return_inverse_true = boolean_dispatch( arg_name='return_counts', arg_index=1, default=False, if_true=_unique_consecutive_impl, if_false=_consecutive_return_inverse, module_name=__name__, func_name='unique_consecutive') # The return type of unique depends on `return_inverse`, and `return_counts` so in order to # resolve the output type in TorchScript we need to statically know the value of both parameters unique_consecutive = boolean_dispatch( arg_name='return_inverse', arg_index=2, default=False, if_true=_consecutive_return_inverse_true, if_false=_consecutive_return_inverse_false, module_name=__name__, func_name='unique_consecutive') unique_consecutive.__doc__ = _unique_consecutive_impl.__doc__ def tensordot(a, b, dims=2): r"""Returns a contraction of a and b over multiple dimensions. :attr:`tensordot` implements a generalized matrix product. Args: a (Tensor): Left tensor to contract b (Tensor): Right tensor to contract dims (int or tuple of two lists of integers): number of dimensions to contract or explicit lists of dimensions for :attr:`a` and :attr:`b` respectively When called with a non-negative integer argument :attr:`dims` = :math:`d`, and the number of dimensions of :attr:`a` and :attr:`b` is :math:`m` and :math:`n`, respectively, :func:`~torch.tensordot` computes .. math:: r_{i_0,...,i_{m-d}, i_d,...,i_n} = \sum_{k_0,...,k_{d-1}} a_{i_0,...,i_{m-d},k_0,...,k_{d-1}} \times b_{k_0,...,k_{d-1}, i_d,...,i_n}. When called with :attr:`dims` of the list form, the given dimensions will be contracted in place of the last :math:`d` of :attr:`a` and the first :math:`d` of :math:`b`. The sizes in these dimensions must match, but :func:`~torch.tensordot` will deal with broadcasted dimensions. 
Examples:: >>> a = torch.arange(60.).reshape(3, 4, 5) >>> b = torch.arange(24.).reshape(4, 3, 2) >>> torch.tensordot(a, b, dims=([1, 0], [0, 1])) tensor([[4400., 4730.], [4532., 4874.], [4664., 5018.], [4796., 5162.], [4928., 5306.]]) >>> a = torch.randn(3, 4, 5, device='cuda') >>> b = torch.randn(4, 5, 6, device='cuda') >>> c = torch.tensordot(a, b, dims=2).cpu() tensor([[ 8.3504, -2.5436, 6.2922, 2.7556, -1.0732, 3.2741], [ 3.3161, 0.0704, 5.0187, -0.4079, -4.3126, 4.8744], [ 0.8223, 3.9445, 3.2168, -0.2400, 3.4117, 1.7780]]) """ if not torch.jit.is_scripting(): if (type(a) is not Tensor or type(b) is not Tensor) and has_torch_function((a, b)): return handle_torch_function(tensordot, (a, b), a, b, dims=dims) if isinstance(dims, (list, tuple)) or \ (isinstance(dims, torch.Tensor) and dims.numel() > 1): dims_a, dims_b = dims else: if isinstance(dims, torch.Tensor): dims = dims.item() if dims < 0: raise RuntimeError(f"tensordot expects dims >= 0, but got dims={dims}") dims_a = list(range(-dims, 0)) dims_b = list(range(dims)) return _VF.tensordot(a, b, dims_a, dims_b) # type: ignore def cartesian_prod(*tensors): """Do cartesian product of the given sequence of tensors. The behavior is similar to python's `itertools.product`. Arguments: *tensors: any number of 1 dimensional tensors. Returns: Tensor: A tensor equivalent to converting all the input tensors into lists, do `itertools.product` on these lists, and finally convert the resulting list into tensor. Example:: >>> a = [1, 2, 3] >>> b = [4, 5] >>> list(itertools.product(a, b)) [(1, 4), (1, 5), (2, 4), (2, 5), (3, 4), (3, 5)] >>> tensor_a = torch.tensor(a) >>> tensor_b = torch.tensor(b) >>> torch.cartesian_prod(tensor_a, tensor_b) tensor([[1, 4], [1, 5], [2, 4], [2, 5], [3, 4], [3, 5]]) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(cartesian_prod, tensors, *tensors) return _VF.cartesian_prod(tensors) # type: ignore def block_diag(*tensors): """Create a block diagonal matrix from provided tensors. Arguments: *tensors: One or more tensors with 0, 1, or 2 dimensions. Returns: Tensor: A 2 dimensional tensor with all the input tensors arranged in order such that their upper left and lower right corners are diagonally adjacent. All other elements are set to 0. Example:: >>> import torch >>> A = torch.tensor([[0, 1], [1, 0]]) >>> B = torch.tensor([[3, 4, 5], [6, 7, 8]]) >>> C = torch.tensor(7) >>> D = torch.tensor([1, 2, 3]) >>> E = torch.tensor([[4], [5], [6]]) >>> torch.block_diag(A, B, C, D, E) tensor([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 3, 4, 5, 0, 0, 0, 0, 0], [0, 0, 6, 7, 8, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 7, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 2, 3, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 4], [0, 0, 0, 0, 0, 0, 0, 0, 0, 5], [0, 0, 0, 0, 0, 0, 0, 0, 0, 6]]) """ if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(block_diag, tensors, *tensors) return torch._C._VariableFunctions.block_diag(tensors) # type: ignore def cdist(x1, x2, p=2., compute_mode='use_mm_for_euclid_dist_if_necessary'): # type: (Tensor, Tensor, float, str) -> (Tensor) r"""Computes batched the p-norm distance between each pair of the two collections of row vectors. Args: x1 (Tensor): input tensor of shape :math:`B \times P \times M`. x2 (Tensor): input tensor of shape :math:`B \times R \times M`. p: p value for the p-norm distance to calculate between each vector pair :math:`\in [0, \infty]`. 
compute_mode: 'use_mm_for_euclid_dist_if_necessary' - will use matrix multiplication approach to calculate euclidean distance (p = 2) if P > 25 or R > 25 'use_mm_for_euclid_dist' - will always use matrix multiplication approach to calculate euclidean distance (p = 2) 'donot_use_mm_for_euclid_dist' - will never use matrix multiplication approach to calculate euclidean distance (p = 2) Default: use_mm_for_euclid_dist_if_necessary. If x1 has shape :math:`B \times P \times M` and x2 has shape :math:`B \times R \times M` then the output will have shape :math:`B \times P \times R`. This function is equivalent to `scipy.spatial.distance.cdist(input,'minkowski', p=p)` if :math:`p \in (0, \infty)`. When :math:`p = 0` it is equivalent to `scipy.spatial.distance.cdist(input, 'hamming') * M`. When :math:`p = \infty`, the closest scipy function is `scipy.spatial.distance.cdist(xn, lambda x, y: np.abs(x - y).max())`. Example: >>> a = torch.tensor([[0.9041, 0.0196], [-0.3108, -2.4423], [-0.4821, 1.059]]) >>> a tensor([[ 0.9041, 0.0196], [-0.3108, -2.4423], [-0.4821, 1.0590]]) >>> b = torch.tensor([[-2.1763, -0.4713], [-0.6986, 1.3702]]) >>> b tensor([[-2.1763, -0.4713], [-0.6986, 1.3702]]) >>> torch.cdist(a, b, p=2) tensor([[3.1193, 2.0959], [2.7138, 3.8322], [2.2830, 0.3791]]) """ if not torch.jit.is_scripting(): if (type(x1) is not Tensor or type(x2) is not Tensor) and has_torch_function((x1, x2)): return handle_torch_function( cdist, (x1, x2), x1, x2, p=p, compute_mode=compute_mode) if compute_mode == 'use_mm_for_euclid_dist_if_necessary': return _VF.cdist(x1, x2, p, None) # type: ignore elif compute_mode == 'use_mm_for_euclid_dist': return _VF.cdist(x1, x2, p, 1) # type: ignore elif compute_mode == 'donot_use_mm_for_euclid_dist': return _VF.cdist(x1, x2, p, 2) # type: ignore else: raise ValueError(f"{compute_mode} is not a valid value for compute_mode") def atleast_1d(*tensors): r""" Returns a 1-dimensional view of each input tensor with zero dimensions. Input tensors with one or more dimensions are returned as-is. Args: input (Tensor or list of Tensors) Returns: output (Tensor or tuple of Tensors) Example:: >>> x = torch.randn(2) >>> x tensor([1.4584, 0.7583]) >>> torch.atleast_1d(x) tensor([1.4584, 0.7583]) >>> x = torch.tensor(1.) >>> x tensor(1.) >>> torch.atleast_1d(x) tensor([1.]) >>> x = torch.tensor(0.5) >>> y = torch.tensor(1.) >>> torch.atleast_1d((x,y)) (tensor([0.5000]), tensor([1.])) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(atleast_1d, tensors, *tensors) if len(tensors) == 1: tensors = tensors[0] return _VF.atleast_1d(tensors) # type: ignore def atleast_2d(*tensors): r""" Returns a 2-dimensional view of each input tensor with zero dimensions. Input tensors with two or more dimensions are returned as-is. Args: input (Tensor or list of Tensors) Returns: output (Tensor or tuple of Tensors) Example:: >>> x = torch.tensor(1.) >>> x tensor(1.) >>> torch.atleast_2d(x) tensor([[1.]]) >>> x = torch.randn(2,2) >>> x tensor([[2.2086, 2.5165], [0.1757, 0.5194]]) >>> torch.atleast_2d(x) tensor([[2.2086, 2.5165], [0.1757, 0.5194]]) >>> x = torch.tensor(0.5) >>> y = torch.tensor(1.)
>>> torch.atleast_2d((x,y)) (tensor([[0.5000]]), tensor([[1.]])) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(atleast_2d, tensors, *tensors) if len(tensors) == 1: tensors = tensors[0] return _VF.atleast_2d(tensors) # type: ignore def atleast_3d(*tensors): r""" Returns a 3-dimensional view of each input tensor with zero dimensions. Input tensors with three or more dimensions are returned as-is. Args: input (Tensor or list of Tensors) Returns: output (Tensor or tuple of Tensors) Example: >>> x = torch.tensor(0.5) >>> x tensor(0.5000) >>> torch.atleast_3d(x) tensor([[[0.5000]]]) >>> y = torch.randn(2,2) >>> y tensor([[-0.8079, 0.7460], [-1.1647, 1.4734]]) >>> torch.atleast_3d(y) tensor([[[-0.8079], [ 0.7460]], [[-1.1647], [ 1.4734]]]) >>> x = torch.randn(1,1,1) >>> x tensor([[[-1.5689]]]) >>> torch.atleast_3d(x) tensor([[[-1.5689]]]) >>> x = torch.tensor(0.5) >>> y = torch.tensor(1.) >>> torch.atleast_3d((x,y)) (tensor([[[0.5000]]]), tensor([[[1.]]])) """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in tensors) and has_torch_function(tensors): return handle_torch_function(atleast_3d, tensors, *tensors) if len(tensors) == 1: tensors = tensors[0] return _VF.atleast_3d(tensors) # type: ignore if TYPE_CHECKING: pass # There's no good way to use this type annotation; cannot rename norm() to # _norm_impl() in a way that doesn't break JIT overloads. So leave untyped # for mypy for now. # def norm(input: Tensor, # p: Optional[Union[str, Number]] = "fro", # dim: Optional[Union[int, List[int]]] = None, # keepdim: bool = False, # out: Optional[Tensor] = None, # dtype: _dtype = None) -> Tensor: # return _norm_impl(input, p, dim, keepdim, out, dtype) else: # TODO: type dim as BroadcastingList when # https://github.com/pytorch/pytorch/issues/33782 is fixed @overload # noqa: 749 def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 # type: (Tensor, str, Optional[List[int]], bool, Optional[Tensor], Optional[int]) -> Tensor pass @overload # noqa: 749 def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 # type: (Tensor, Optional[number], Optional[List[int]], bool, Optional[Tensor], Optional[int]) -> Tensor pass @overload # noqa: 749 def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 # type: (Tensor, Optional[number], Optional[int], bool, Optional[Tensor], Optional[int]) -> Tensor pass @overload # noqa: 749 def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 # type: (Tensor, str, Optional[int], bool, Optional[Tensor], Optional[int]) -> Tensor pass def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None): # noqa: 749 r"""Returns the matrix norm or vector norm of a given tensor. .. warning:: torch.norm is deprecated and may be removed in a future PyTorch release. Use :func:`torch.linalg.norm` instead, but note that :func:`torch.linalg.norm` has a different signature and slightly different behavior that is more consistent with NumPy's numpy.linalg.norm. Args: input (Tensor): the input tensor p (int, float, inf, -inf, 'fro', 'nuc', optional): the order of norm.
Default: ``'fro'`` The following norms can be calculated: ===== ============================ ========================== ord matrix norm vector norm ===== ============================ ========================== None Frobenius norm 2-norm 'fro' Frobenius norm -- 'nuc' nuclear norm -- Other as vec norm when dim is None sum(abs(x)**ord)**(1./ord) ===== ============================ ========================== dim (int, 2-tuple of ints, 2-list of ints, optional): If it is an int, vector norm will be calculated, if it is 2-tuple of ints, matrix norm will be calculated. If the value is None, matrix norm will be calculated when the input tensor only has two dimensions, vector norm will be calculated when the input tensor only has one dimension. If the input tensor has more than two dimensions, the vector norm will be applied to last dimension. keepdim (bool, optional): whether the output tensors have :attr:`dim` retained or not. Ignored if :attr:`dim` = ``None`` and :attr:`out` = ``None``. Default: ``False`` out (Tensor, optional): the output tensor. Ignored if :attr:`dim` = ``None`` and :attr:`out` = ``None``. dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. If specified, the input tensor is cast to :attr:`dtype` while performing the operation. Default: None. Example:: >>> import torch >>> a = torch.arange(9, dtype= torch.float) - 4 >>> b = a.reshape((3, 3)) >>> torch.norm(a) tensor(7.7460) >>> torch.norm(b) tensor(7.7460) >>> torch.norm(a, float('inf')) tensor(4.) >>> torch.norm(b, float('inf')) tensor(4.) >>> c = torch.tensor([[ 1, 2, 3],[-1, 1, 4]] , dtype= torch.float) >>> torch.norm(c, dim=0) tensor([1.4142, 2.2361, 5.0000]) >>> torch.norm(c, dim=1) tensor([3.7417, 4.2426]) >>> torch.norm(c, p=1, dim=1) tensor([6., 6.]) >>> d = torch.arange(8, dtype= torch.float).reshape(2,2,2) >>> torch.norm(d, dim=(1,2)) tensor([ 3.7417, 11.2250]) >>> torch.norm(d[0, :, :]), torch.norm(d[1, :, :]) (tensor(3.7417), tensor(11.2250)) """ if not torch.jit.is_scripting(): if type(input) is not Tensor and has_torch_function((input,)): return handle_torch_function( norm, (input,), input, p=p, dim=dim, keepdim=keepdim, out=out, dtype=dtype) ndim = input.dim() # catch default case if dim is None and out is None and dtype is None and p is not None: if isinstance(p, str): if p == "fro": return _VF.frobenius_norm(input, dim=(), keepdim=keepdim) # type: ignore if not isinstance(p, str): _dim = [i for i in range(ndim)] # noqa: C416 TODO: rewrite as list(range(m)) return _VF.norm(input, p, dim=_dim, keepdim=keepdim) # type: ignore # TODO: when https://github.com/pytorch/pytorch/issues/33782 is fixed # remove the overloads where dim is an int and replace with BroadcastingList1 # and remove next four lines, replace _dim with dim if dim is not None: if isinstance(dim, int): _dim = [dim] else: _dim = dim else: _dim = None # type: ignore if isinstance(p, str): if p == "fro": if dtype is not None: raise ValueError("dtype argument is not supported in frobenius norm") if _dim is None: _dim = [i for i in range(ndim)] # noqa: C416 TODO: rewrite as list(range(m)) if out is None: return _VF.frobenius_norm(input, _dim, keepdim=keepdim) # type: ignore else: return _VF.frobenius_norm(input, _dim, keepdim=keepdim, out=out) # type: ignore elif p == "nuc": if dtype is not None: raise ValueError("dtype argument is not supported in nuclear norm") if _dim is None: if out is None: return _VF.nuclear_norm(input, keepdim=keepdim) # type: ignore else: return _VF.nuclear_norm(input, keepdim=keepdim, out=out) #
type: ignore else: if out is None: return _VF.nuclear_norm(input, _dim, keepdim=keepdim) # type: ignore else: return _VF.nuclear_norm(input, _dim, keepdim=keepdim, out=out) # type: ignore raise RuntimeError(f"only valid string values are 'fro' and 'nuc', found {p}") else: if _dim is None: _dim = [i for i in range(ndim)] # noqa: C416 TODO: rewrite as list(range(m)) if out is None: if dtype is None: return _VF.norm(input, p, _dim, keepdim=keepdim) # type: ignore else: return _VF.norm(input, p, _dim, keepdim=keepdim, dtype=dtype) # type: ignore else: if dtype is None: return _VF.norm(input, p, _dim, keepdim=keepdim, out=out) # type: ignore else: return _VF.norm(input, p, _dim, keepdim=keepdim, dtype=dtype, out=out) # type: ignore def chain_matmul(*matrices): r"""Returns the matrix product of the :math:`N` 2-D tensors. This product is efficiently computed using the matrix chain order algorithm which selects the order that incurs the lowest cost in terms of arithmetic operations (`[CLRS]`_). Note that since this is a function to compute the product, :math:`N` needs to be greater than or equal to 2; if equal to 2 then a trivial matrix-matrix product is returned. If :math:`N` is 1, then this is a no-op - the original matrix is returned as is. Args: matrices (Tensors...): a sequence of 2 or more 2-D tensors whose product is to be determined. Returns: Tensor: if the :math:`i^{th}` tensor was of dimensions :math:`p_{i} \times p_{i + 1}`, then the product would be of dimensions :math:`p_{1} \times p_{N + 1}`. Example:: >>> a = torch.randn(3, 4) >>> b = torch.randn(4, 5) >>> c = torch.randn(5, 6) >>> d = torch.randn(6, 7) >>> torch.chain_matmul(a, b, c, d) tensor([[ -2.3375, -3.9790, -4.1119, -6.6577, 9.5609, -11.5095, -3.2614], [ 21.4038, 3.3378, -8.4982, -5.2457, -10.2561, -2.4684, 2.7163], [ -0.9647, -5.8917, -2.3213, -5.2284, 12.8615, -12.2816, -2.5095]]) .. _`[CLRS]`: https://mitpress.mit.edu/books/introduction-algorithms-third-edition """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in matrices) and has_torch_function(matrices): return handle_torch_function(chain_matmul, matrices, *matrices) return _VF.chain_matmul(matrices) # type: ignore def _lu_impl(A, pivot=True, get_infos=False, out=None): # type: (Tensor, bool, bool, Any) -> Tuple[Tensor, Tensor, Tensor] r"""Computes the LU factorization of a matrix or batches of matrices :attr:`A`. Returns a tuple containing the LU factorization and pivots of :attr:`A`. Pivoting is done if :attr:`pivot` is set to ``True``. .. note:: The pivots returned by the function are 1-indexed. If :attr:`pivot` is ``False``, then the returned pivots tensor is filled with zeros of the appropriate size. .. note:: LU factorization with :attr:`pivot` = ``False`` is not available for CPU, and attempting to do so will throw an error. However, LU factorization with :attr:`pivot` = ``False`` is available for CUDA. .. note:: This function does not check if the factorization was successful or not if :attr:`get_infos` is ``True`` since the status of the factorization is present in the third element of the return tuple. .. note:: In the case of batches of square matrices with size less than or equal to 32 on a CUDA device, the LU factorization is repeated for singular matrices due to the bug in the MAGMA library (see magma issue 13). .. note:: ``L``, ``U``, and ``P`` can be derived using :func:`torch.lu_unpack`. Arguments: A (Tensor): the tensor to factor of size :math:`(*, m, n)` pivot (bool, optional): controls whether pivoting is done.
Default: ``True`` get_infos (bool, optional): if set to ``True``, returns an info IntTensor. Default: ``False`` out (tuple, optional): optional output tuple. If :attr:`get_infos` is ``True``, then the elements in the tuple are Tensor, IntTensor, and IntTensor. If :attr:`get_infos` is ``False``, then the elements in the tuple are Tensor, IntTensor. Default: ``None`` Returns: (Tensor, IntTensor, IntTensor (optional)): A tuple of tensors containing - **factorization** (*Tensor*): the factorization of size :math:`(*, m, n)` - **pivots** (*IntTensor*): the pivots of size :math:`(*, m)` - **infos** (*IntTensor*, *optional*): if :attr:`get_infos` is ``True``, this is a tensor of size :math:`(*)` where non-zero values indicate whether factorization for the matrix or each minibatch has succeeded or failed Example:: >>> A = torch.randn(2, 3, 3) >>> A_LU, pivots = torch.lu(A) >>> A_LU tensor([[[ 1.3506, 2.5558, -0.0816], [ 0.1684, 1.1551, 0.1940], [ 0.1193, 0.6189, -0.5497]], [[ 0.4526, 1.2526, -0.3285], [-0.7988, 0.7175, -0.9701], [ 0.2634, -0.9255, -0.3459]]]) >>> pivots tensor([[ 3, 3, 3], [ 3, 3, 3]], dtype=torch.int32) >>> A_LU, pivots, info = torch.lu(A, get_infos=True) >>> if info.nonzero().size(0) == 0: ... print('LU factorization succeeded for all samples!') LU factorization succeeded for all samples! """ # If get_infos is True, then we don't need to check for errors and vice versa return torch._lu_with_info(A, pivot=pivot, check_errors=(not get_infos)) if TYPE_CHECKING: _ListOrSeq = Sequence[Tensor] else: _ListOrSeq = List[Tensor] def _check_list_size(out_len: int, get_infos: bool, out: _ListOrSeq) -> None: get_infos_int = 1 if get_infos else 0 if out_len - get_infos_int != 2: raise TypeError(f"expected tuple of {2 + int(get_infos)} elements but got {out_len}") if not isinstance(out, (tuple, list)): raise TypeError(f"argument 'out' must be tuple of Tensors, not {type(out).__name__}") def _lu_with_infos(A, pivot=True, get_infos=False, out=None): # type: (Tensor, bool, bool, Optional[Tuple[Tensor, Tensor, Tensor]]) -> Tuple[Tensor, Tensor, Tensor] if not torch.jit.is_scripting(): if type(A) is not Tensor and has_torch_function((A,)): return handle_torch_function( lu, (A,), A, pivot=pivot, get_infos=get_infos, out=out) result = _lu_impl(A, pivot, get_infos, out) if out is not None: _check_list_size(len(out), get_infos, out) for i in range(len(out)): out[i].resize_as_(result[i]).copy_(result[i]) return out else: return result # A_LU, pivots, infos def _lu_no_infos(A, pivot=True, get_infos=False, out=None): # type: (Tensor, bool, bool, Optional[Tuple[Tensor, Tensor]]) -> Tuple[Tensor, Tensor] # need to check for torch_function here so that we exit early if a # torch_function override should handle this call if not torch.jit.is_scripting(): if type(A) is not Tensor and has_torch_function((A,)): return handle_torch_function( lu, (A,), A, pivot=pivot, get_infos=get_infos, out=out) result = _lu_impl(A, pivot, get_infos, out) if out is not None: _check_list_size(len(out), get_infos, out) for i in range(len(out)): out[i].resize_as_(result[i]).copy_(result[i]) return out else: return result[0], result[1] # A_LU, pivots # The return type of lu depends on `get_infos`, so in order to resolve the output type # of lu in TorchScript we need to statically know the value of `get_infos` lu = boolean_dispatch( arg_name='get_infos', arg_index=2, default=False, if_true=_lu_with_infos, if_false=_lu_no_infos, module_name=__name__, func_name='lu') lu.__doc__ = _lu_impl.__doc__ def align_tensors(*tensors): raise RuntimeError('`align_tensors` not yet implemented.')
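Editor's aside on the boolean_dispatch pattern that functional.py uses for unique, unique_consecutive, and lu above: TorchScript must resolve a function's return arity statically, so the module keeps one concrete implementation per flag combination and wires them together with torch's boolean_dispatch helper at import time. Below is a minimal standalone sketch of the same idea in plain Python (simplified to a runtime check; all names are illustrative and not part of the patch):

def _values_only(xs):
    # single-return variant: just the sorted unique values
    return sorted(set(xs))

def _values_and_counts(xs):
    # two-return variant: the values plus a per-value count
    vals = sorted(set(xs))
    return vals, [xs.count(v) for v in vals]

def boolean_dispatch_demo(arg_name, default, if_true, if_false):
    # select the implementation from a boolean keyword argument, so every
    # call site with a literal flag has a fixed, statically known return type
    def dispatched(xs, **kwargs):
        return if_true(xs) if kwargs.get(arg_name, default) else if_false(xs)
    return dispatched

unique_list = boolean_dispatch_demo('return_counts', False, _values_and_counts, _values_only)

assert unique_list([1, 3, 2, 3]) == [1, 2, 3]
assert unique_list([1, 3, 2, 3], return_counts=True) == ([1, 2, 3], [1, 1, 2])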
================================================ FILE: patches/transformers/4.5.0/convert_graph_to_onnx.diff ================================================ 14a15,17 > import os > import json > 83a87,91 > "--save-config", > action="store_true", > help="Save the model configuration along with the ONNX", > ) > self.add_argument( 280a289,295 > print('Exporting from PyTorch to ONNX...') > print('input_names', input_names) > print('output_names', output_names) > print('dynamic_axes', dynamic_axes) > print('tokens', tokens) > print('model_args', model_args) > 291a307 > verbose=True 339a356 > save_config: bool = False, 366,367c383,384 < elif len(listdir(output.parent.as_posix())) > 0: < raise Exception(f"Folder {output.parent.as_posix()} is not empty, aborting conversion") --- > #elif len(listdir(output.parent.as_posix())) > 0: > # raise Exception(f"Folder {output.parent.as_posix()} is not empty, aborting conversion") 374c391,407 < --- > > # Save the configuration > if save_config: > config_path = os.path.splitext(output)[0] + '.json' > > config = dict( > model = nlp.model.config.to_dict(), > tokenizer = nlp.tokenizer.init_kwargs > ) > > #nlp.model.config.to_json_file(config_path) > > with open(config_path, 'w') as config_file: > json.dump(config, config_file, indent=2) > > print(f"Saved config to {config_path}") > 468a502 > args.save_config ================================================ FILE: patches/transformers/4.5.0/convert_graph_to_onnx.original.py ================================================ # Copyright 2020 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
from argparse import ArgumentParser from os import listdir, makedirs from pathlib import Path from typing import Dict, List, Optional, Tuple from packaging.version import Version, parse from transformers.file_utils import ModelOutput, is_tf_available, is_torch_available from transformers.pipelines import Pipeline, pipeline from transformers.tokenization_utils import BatchEncoding # This is the minimal required version to # support some ONNX Runtime features ORT_QUANTIZE_MINIMUM_VERSION = parse("1.4.0") SUPPORTED_PIPELINES = [ "feature-extraction", "ner", "sentiment-analysis", "fill-mask", "question-answering", "text-generation", "translation_en_to_fr", "translation_en_to_de", "translation_en_to_ro", ] class OnnxConverterArgumentParser(ArgumentParser): """ Wraps all the script arguments supported to export transformers models to ONNX IR """ def __init__(self): super().__init__("ONNX Converter") self.add_argument( "--pipeline", type=str, choices=SUPPORTED_PIPELINES, default="feature-extraction", ) self.add_argument( "--model", type=str, required=True, help="Model's id or path (ex: bert-base-cased)", ) self.add_argument("--tokenizer", type=str, help="Tokenizer's id or path (ex: bert-base-cased)") self.add_argument( "--framework", type=str, choices=["pt", "tf"], help="Framework for loading the model", ) self.add_argument("--opset", type=int, default=11, help="ONNX opset to use") self.add_argument( "--check-loading", action="store_true", help="Check ONNX is able to load the model", ) self.add_argument( "--use-external-format", action="store_true", help="Allow exporting model >= than 2Gb", ) self.add_argument( "--quantize", action="store_true", help="Quantize the neural network to be run with int8", ) self.add_argument("output") def generate_identified_filename(filename: Path, identifier: str) -> Path: """ Append a string-identifier at the end (before the extension, if any) to the provided filepath Args: filename: pathlib.Path The actual path object we would like to add an identifier suffix identifier: The suffix to add Returns: String with concatenated identifier at the end of the filename """ return filename.parent.joinpath(filename.stem + identifier).with_suffix(filename.suffix) def check_onnxruntime_requirements(minimum_version: Version): """ Check onnxruntime is installed and if the installed version match is recent enough Raises: ImportError: If onnxruntime is not installed or too old version is found """ try: import onnxruntime # Parse the version of the installed onnxruntime ort_version = parse(onnxruntime.__version__) # We require 1.4.0 minimum if ort_version < ORT_QUANTIZE_MINIMUM_VERSION: raise ImportError( f"We found an older version of onnxruntime ({onnxruntime.__version__}) " f"but we require onnxruntime to be >= {minimum_version} to enable all the conversions options.\n" f"Please update onnxruntime by running `pip install --upgrade onnxruntime`" ) except ImportError: raise ImportError( "onnxruntime doesn't seem to be currently installed. " "Please install the onnxruntime by running `pip install onnxruntime`" " and relaunch the conversion." 
) def ensure_valid_input(model, tokens, input_names): """ Ensure input are presented in the correct order, without any Non Args: model: The model used to forward the input data tokens: BatchEncoding holding the input data input_names: The name of the inputs Returns: Tuple """ print("Ensuring inputs are in correct order") model_args_name = model.forward.__code__.co_varnames model_args, ordered_input_names = [], [] for arg_name in model_args_name[1:]: # start at index 1 to skip "self" argument if arg_name in input_names: ordered_input_names.append(arg_name) model_args.append(tokens[arg_name]) else: print(f"{arg_name} is not present in the generated input list.") break print(f"Generated inputs order: {ordered_input_names}") return ordered_input_names, tuple(model_args) def infer_shapes(nlp: Pipeline, framework: str) -> Tuple[List[str], List[str], Dict, BatchEncoding]: """ Attempt to infer the static vs dynamic axes for each input and output tensors for a specific model Args: nlp: The pipeline object holding the model to be exported framework: The framework identifier to dispatch to the correct inference scheme (pt/tf) Returns: - List of the inferred input variable names - List of the inferred output variable names - Dictionary with input/output variables names as key and shape tensor as value - a BatchEncoding reference which was used to infer all the above information """ def build_shape_dict(name: str, tensor, is_input: bool, seq_len: int): if isinstance(tensor, (tuple, list)): return [build_shape_dict(name, t, is_input, seq_len) for t in tensor] else: # Let's assume batch is the first axis with only 1 element (~~ might not be always true ...) axes = {[axis for axis, numel in enumerate(tensor.shape) if numel == 1][0]: "batch"} if is_input: if len(tensor.shape) == 2: axes[1] = "sequence" else: raise ValueError(f"Unable to infer tensor axes ({len(tensor.shape)})") else: seq_axes = [dim for dim, shape in enumerate(tensor.shape) if shape == seq_len] axes.update({dim: "sequence" for dim in seq_axes}) print(f"Found {'input' if is_input else 'output'} {name} with shape: {axes}") return axes tokens = nlp.tokenizer("This is a sample output", return_tensors=framework) seq_len = tokens.input_ids.shape[-1] outputs = nlp.model(**tokens) if framework == "pt" else nlp.model(tokens) if isinstance(outputs, ModelOutput): outputs = outputs.to_tuple() if not isinstance(outputs, (list, tuple)): outputs = (outputs,) # Generate input names & axes input_vars = list(tokens.keys()) input_dynamic_axes = {k: build_shape_dict(k, v, True, seq_len) for k, v in tokens.items()} # flatten potentially grouped outputs (past for gpt2, attentions) outputs_flat = [] for output in outputs: if isinstance(output, (tuple, list)): outputs_flat.extend(output) else: outputs_flat.append(output) # Generate output names & axes output_names = [f"output_{i}" for i in range(len(outputs_flat))] output_dynamic_axes = {k: build_shape_dict(k, v, False, seq_len) for k, v in zip(output_names, outputs_flat)} # Create the aggregated axes representation dynamic_axes = dict(input_dynamic_axes, **output_dynamic_axes) return input_vars, output_names, dynamic_axes, tokens def load_graph_from_args( pipeline_name: str, framework: str, model: str, tokenizer: Optional[str] = None, **models_kwargs ) -> Pipeline: """ Convert the set of arguments provided through the CLI to an actual pipeline reference (tokenizer + model Args: pipeline_name: The kind of pipeline to use (ner, question-answering, etc.) 
framework: The actual model to convert the pipeline from ("pt" or "tf") model: The model name which will be loaded by the pipeline tokenizer: The tokenizer name which will be loaded by the pipeline, default to the model's value Returns: Pipeline object """ # If no tokenizer provided if tokenizer is None: tokenizer = model # Check the wanted framework is available if framework == "pt" and not is_torch_available(): raise Exception("Cannot convert because PyTorch is not installed. Please install torch first.") if framework == "tf" and not is_tf_available(): raise Exception("Cannot convert because TF is not installed. Please install tensorflow first.") print(f"Loading pipeline (model: {model}, tokenizer: {tokenizer})") # Allocate tokenizer and model return pipeline(pipeline_name, model=model, tokenizer=tokenizer, framework=framework, model_kwargs=models_kwargs) def convert_pytorch(nlp: Pipeline, opset: int, output: Path, use_external_format: bool): """ Export a PyTorch backed pipeline to ONNX Intermediate Representation (IR Args: nlp: The pipeline to be exported opset: The actual version of the ONNX operator set to use output: Path where will be stored the generated ONNX model use_external_format: Split the model definition from its parameters to allow model bigger than 2GB Returns: """ if not is_torch_available(): raise Exception("Cannot convert because PyTorch is not installed. Please install torch first.") import torch from torch.onnx import export print(f"Using framework PyTorch: {torch.__version__}") with torch.no_grad(): input_names, output_names, dynamic_axes, tokens = infer_shapes(nlp, "pt") ordered_input_names, model_args = ensure_valid_input(nlp.model, tokens, input_names) export( nlp.model, model_args, f=output.as_posix(), input_names=ordered_input_names, output_names=output_names, dynamic_axes=dynamic_axes, do_constant_folding=True, use_external_data_format=use_external_format, enable_onnx_checker=True, opset_version=opset, ) def convert_tensorflow(nlp: Pipeline, opset: int, output: Path): """ Export a TensorFlow backed pipeline to ONNX Intermediate Representation (IR Args: nlp: The pipeline to be exported opset: The actual version of the ONNX operator set to use output: Path where will be stored the generated ONNX model Notes: TensorFlow cannot export model bigger than 2GB due to internal constraint from TensorFlow """ if not is_tf_available(): raise Exception("Cannot convert because TF is not installed. Please install tensorflow first.") print("/!\\ Please note TensorFlow doesn't support exporting model > 2Gb /!\\") try: import tensorflow as tf from keras2onnx import __version__ as k2ov from keras2onnx import convert_keras, save_model print(f"Using framework TensorFlow: {tf.version.VERSION}, keras2onnx: {k2ov}") # Build input_names, output_names, dynamic_axes, tokens = infer_shapes(nlp, "tf") # Forward nlp.model.predict(tokens.data) onnx_model = convert_keras(nlp.model, nlp.model.name, target_opset=opset) save_model(onnx_model, output.as_posix()) except ImportError as e: raise Exception(f"Cannot import {e.name} required to convert TF model to ONNX. 
Please install {e.name} first.") def convert( framework: str, model: str, output: Path, opset: int, tokenizer: Optional[str] = None, use_external_format: bool = False, pipeline_name: str = "feature-extraction", **model_kwargs ): """ Convert the pipeline object to the ONNX Intermediate Representation (IR) format Args: framework: The framework the pipeline is backed by ("pt" or "tf") model: The name of the model to load for the pipeline output: The path where the ONNX graph will be stored opset: The actual version of the ONNX operator set to use tokenizer: The name of the model to load for the pipeline, default to the model's name if not provided use_external_format: Split the model definition from its parameters to allow model bigger than 2GB (PyTorch only) pipeline_name: The kind of pipeline to instantiate (ner, question-answering, etc.) model_kwargs: Keyword arguments to be forwarded to the model constructor Returns: """ print(f"ONNX opset version set to: {opset}") # Load the pipeline nlp = load_graph_from_args(pipeline_name, framework, model, tokenizer, **model_kwargs) if not output.parent.exists(): print(f"Creating folder {output.parent}") makedirs(output.parent.as_posix()) elif len(listdir(output.parent.as_posix())) > 0: raise Exception(f"Folder {output.parent.as_posix()} is not empty, aborting conversion") # Export the graph if framework == "pt": convert_pytorch(nlp, opset, output, use_external_format) else: convert_tensorflow(nlp, opset, output) def optimize(onnx_model_path: Path) -> Path: """ Load the model at the specified path and let onnxruntime look at transformations on the graph to enable all the optimizations possibl Args: onnx_model_path: filepath where the model binary description is stored Returns: Path where the optimized model binary description has been saved """ from onnxruntime import InferenceSession, SessionOptions # Generate model name with suffix "optimized" opt_model_path = generate_identified_filename(onnx_model_path, "-optimized") sess_option = SessionOptions() sess_option.optimized_model_filepath = opt_model_path.as_posix() _ = InferenceSession(onnx_model_path.as_posix(), sess_option) print(f"Optimized model has been written at {opt_model_path}: \N{heavy check mark}") print("/!\\ Optimized model contains hardware specific operators which might not be portable. /!\\") return opt_model_path def quantize(onnx_model_path: Path) -> Path: """ Quantize the weights of the model from float32 to in8 to allow very efficient inference on modern CPU Args: onnx_model_path: Path to location the exported ONNX model is stored Returns: The Path generated for the quantized """ import onnx from onnxruntime.quantization import QuantizationMode, quantize onnx_model = onnx.load(onnx_model_path.as_posix()) # Discussed with @yufenglee from ONNX runtime, this will be address in the next release of onnxruntime print( "As of onnxruntime 1.4.0, models larger than 2GB will fail to quantize due to protobuf constraint.\n" "This limitation will be removed in the next release of onnxruntime." 
) quantized_model = quantize( model=onnx_model, quantization_mode=QuantizationMode.IntegerOps, force_fusions=True, symmetric_weight=True, ) # Append "-quantized" at the end of the model's name quantized_model_path = generate_identified_filename(onnx_model_path, "-quantized") # Save model print(f"Quantized model has been written at {quantized_model_path}: \N{heavy check mark}") onnx.save_model(quantized_model, quantized_model_path.as_posix()) return quantized_model_path def verify(path: Path): from onnxruntime import InferenceSession, SessionOptions from onnxruntime.capi.onnxruntime_pybind11_state import RuntimeException print(f"Checking ONNX model loading from: {path} ...") try: onnx_options = SessionOptions() _ = InferenceSession(path.as_posix(), onnx_options, providers=["CPUExecutionProvider"]) print(f"Model {path} correctly loaded: \N{heavy check mark}") except RuntimeException as re: print(f"Error while loading the model {re}: \N{heavy ballot x}") if __name__ == "__main__": parser = OnnxConverterArgumentParser() args = parser.parse_args() # Make sure output is absolute path args.output = Path(args.output).absolute() try: print("\n====== Converting model to ONNX ======") # Convert convert( args.framework, args.model, args.output, args.opset, args.tokenizer, args.use_external_format, args.pipeline, ) if args.quantize: # Ensure requirements for quantization on onnxruntime is met check_onnxruntime_requirements(ORT_QUANTIZE_MINIMUM_VERSION) # onnxruntime optimizations doesn't provide the same level of performances on TensorFlow than PyTorch if args.framework == "tf": print( "\t Using TensorFlow might not provide the same optimization level compared to PyTorch.\n" "\t For TensorFlow users you can try optimizing the model directly through onnxruntime_tools.\n" "\t For more information, please refer to the onnxruntime documentation:\n" "\t\thttps://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers\n" ) print("\n====== Optimizing ONNX model ======") # Quantization works best when using the optimized version of the model args.optimized_output = optimize(args.output) # Do the quantization on the right graph args.quantized_output = quantize(args.optimized_output) # And verify if args.check_loading: print("\n====== Check exported ONNX model(s) ======") verify(args.output) if hasattr(args, "optimized_output"): verify(args.optimized_output) if hasattr(args, "quantized_output"): verify(args.quantized_output) except Exception as e: print(f"Error while converting the model: {e}") exit(1) ================================================ FILE: patches/transformers/4.5.0/convert_graph_to_onnx.py ================================================ # Copyright 2020 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
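# Editor's note (illustrative, not part of the upstream file): this is the
# patched copy; the hunks in convert_graph_to_onnx.diff above add the
# --save-config flag and verbose export logging to it. Assuming transformers
# 4.5.0 with this file swapped in, an export might be invoked like this
# (the model name and output path are placeholders, not taken from the repo):
#
#   python -m transformers.convert_graph_to_onnx --framework pt \
#       --model distilbert-base-cased-distilled-squad \
#       --pipeline question-answering --save-config qa/distilbert.onnx
#
# With --save-config set, convert() below also writes the model and tokenizer
# configuration to os.path.splitext(output)[0] + '.json' (qa/distilbert.json
# in this example) next to the exported graph.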
import os import json from argparse import ArgumentParser from os import listdir, makedirs from pathlib import Path from typing import Dict, List, Optional, Tuple from packaging.version import Version, parse from transformers.file_utils import ModelOutput, is_tf_available, is_torch_available from transformers.pipelines import Pipeline, pipeline from transformers.tokenization_utils import BatchEncoding # This is the minimal required version to # support some ONNX Runtime features ORT_QUANTIZE_MINIMUM_VERSION = parse("1.4.0") SUPPORTED_PIPELINES = [ "feature-extraction", "ner", "sentiment-analysis", "fill-mask", "question-answering", "text-generation", "translation_en_to_fr", "translation_en_to_de", "translation_en_to_ro", ] class OnnxConverterArgumentParser(ArgumentParser): """ Wraps all the script arguments supported to export transformers models to ONNX IR """ def __init__(self): super().__init__("ONNX Converter") self.add_argument( "--pipeline", type=str, choices=SUPPORTED_PIPELINES, default="feature-extraction", ) self.add_argument( "--model", type=str, required=True, help="Model's id or path (ex: bert-base-cased)", ) self.add_argument("--tokenizer", type=str, help="Tokenizer's id or path (ex: bert-base-cased)") self.add_argument( "--framework", type=str, choices=["pt", "tf"], help="Framework for loading the model", ) self.add_argument("--opset", type=int, default=11, help="ONNX opset to use") self.add_argument( "--check-loading", action="store_true", help="Check ONNX is able to load the model", ) self.add_argument( "--use-external-format", action="store_true", help="Allow exporting model >= than 2Gb", ) self.add_argument( "--save-config", action="store_true", help="Save the model configuration along with the ONNX", ) self.add_argument( "--quantize", action="store_true", help="Quantize the neural network to be run with int8", ) self.add_argument("output") def generate_identified_filename(filename: Path, identifier: str) -> Path: """ Append a string-identifier at the end (before the extension, if any) to the provided filepath Args: filename: pathlib.Path The actual path object we would like to add an identifier suffix identifier: The suffix to add Returns: String with concatenated identifier at the end of the filename """ return filename.parent.joinpath(filename.stem + identifier).with_suffix(filename.suffix) def check_onnxruntime_requirements(minimum_version: Version): """ Check onnxruntime is installed and if the installed version match is recent enough Raises: ImportError: If onnxruntime is not installed or too old version is found """ try: import onnxruntime # Parse the version of the installed onnxruntime ort_version = parse(onnxruntime.__version__) # We require 1.4.0 minimum if ort_version < ORT_QUANTIZE_MINIMUM_VERSION: raise ImportError( f"We found an older version of onnxruntime ({onnxruntime.__version__}) " f"but we require onnxruntime to be >= {minimum_version} to enable all the conversions options.\n" f"Please update onnxruntime by running `pip install --upgrade onnxruntime`" ) except ImportError: raise ImportError( "onnxruntime doesn't seem to be currently installed. " "Please install the onnxruntime by running `pip install onnxruntime`" " and relaunch the conversion." 
) def ensure_valid_input(model, tokens, input_names): """ Ensure inputs are presented in the correct order, without any None Args: model: The model used to forward the input data tokens: BatchEncoding holding the input data input_names: The names of the inputs Returns: Tuple """ print("Ensuring inputs are in correct order") model_args_name = model.forward.__code__.co_varnames model_args, ordered_input_names = [], [] for arg_name in model_args_name[1:]: # start at index 1 to skip "self" argument if arg_name in input_names: ordered_input_names.append(arg_name) model_args.append(tokens[arg_name]) else: print(f"{arg_name} is not present in the generated input list.") break print(f"Generated inputs order: {ordered_input_names}") return ordered_input_names, tuple(model_args) def infer_shapes(nlp: Pipeline, framework: str) -> Tuple[List[str], List[str], Dict, BatchEncoding]: """ Attempt to infer the static vs dynamic axes for each input and output tensor of a specific model Args: nlp: The pipeline object holding the model to be exported framework: The framework identifier to dispatch to the correct inference scheme (pt/tf) Returns: - List of the inferred input variable names - List of the inferred output variable names - Dictionary with input/output variable names as keys and shape tensors as values - a BatchEncoding reference which was used to infer all the above information """ def build_shape_dict(name: str, tensor, is_input: bool, seq_len: int): if isinstance(tensor, (tuple, list)): return [build_shape_dict(name, t, is_input, seq_len) for t in tensor] else: # Let's assume batch is the first axis with only 1 element (~~ might not be always true ...) axes = {[axis for axis, numel in enumerate(tensor.shape) if numel == 1][0]: "batch"} if is_input: if len(tensor.shape) == 2: axes[1] = "sequence" else: raise ValueError(f"Unable to infer tensor axes ({len(tensor.shape)})") else: seq_axes = [dim for dim, shape in enumerate(tensor.shape) if shape == seq_len] axes.update({dim: "sequence" for dim in seq_axes}) print(f"Found {'input' if is_input else 'output'} {name} with shape: {axes}") return axes tokens = nlp.tokenizer("This is a sample output", return_tensors=framework) seq_len = tokens.input_ids.shape[-1] outputs = nlp.model(**tokens) if framework == "pt" else nlp.model(tokens) if isinstance(outputs, ModelOutput): outputs = outputs.to_tuple() if not isinstance(outputs, (list, tuple)): outputs = (outputs,) # Generate input names & axes input_vars = list(tokens.keys()) input_dynamic_axes = {k: build_shape_dict(k, v, True, seq_len) for k, v in tokens.items()} # flatten potentially grouped outputs (past for gpt2, attentions) outputs_flat = [] for output in outputs: if isinstance(output, (tuple, list)): outputs_flat.extend(output) else: outputs_flat.append(output) # Generate output names & axes output_names = [f"output_{i}" for i in range(len(outputs_flat))] output_dynamic_axes = {k: build_shape_dict(k, v, False, seq_len) for k, v in zip(output_names, outputs_flat)} # Create the aggregated axes representation dynamic_axes = dict(input_dynamic_axes, **output_dynamic_axes) return input_vars, output_names, dynamic_axes, tokens def load_graph_from_args( pipeline_name: str, framework: str, model: str, tokenizer: Optional[str] = None, **models_kwargs ) -> Pipeline: """ Convert the set of arguments provided through the CLI to an actual pipeline reference (tokenizer + model) Args: pipeline_name: The kind of pipeline to use (ner, question-answering, etc.)
framework: The framework to convert the pipeline from ("pt" or "tf") model: The model name which will be loaded by the pipeline tokenizer: The tokenizer name which will be loaded by the pipeline, defaults to the model's value Returns: Pipeline object """ # If no tokenizer provided if tokenizer is None: tokenizer = model # Check the wanted framework is available if framework == "pt" and not is_torch_available(): raise Exception("Cannot convert because PyTorch is not installed. Please install torch first.") if framework == "tf" and not is_tf_available(): raise Exception("Cannot convert because TF is not installed. Please install tensorflow first.") print(f"Loading pipeline (model: {model}, tokenizer: {tokenizer})") # Allocate tokenizer and model return pipeline(pipeline_name, model=model, tokenizer=tokenizer, framework=framework, model_kwargs=models_kwargs) def convert_pytorch(nlp: Pipeline, opset: int, output: Path, use_external_format: bool): """ Export a PyTorch backed pipeline to ONNX Intermediate Representation (IR) Args: nlp: The pipeline to be exported opset: The actual version of the ONNX operator set to use output: Path where the generated ONNX model will be stored use_external_format: Split the model definition from its parameters to allow models bigger than 2GB Returns: """ if not is_torch_available(): raise Exception("Cannot convert because PyTorch is not installed. Please install torch first.") import torch from torch.onnx import export print(f"Using framework PyTorch: {torch.__version__}") with torch.no_grad(): input_names, output_names, dynamic_axes, tokens = infer_shapes(nlp, "pt") ordered_input_names, model_args = ensure_valid_input(nlp.model, tokens, input_names) print('Exporting from PyTorch to ONNX...') print('input_names', input_names) print('output_names', output_names) print('dynamic_axes', dynamic_axes) print('tokens', tokens) print('model_args', model_args) export( nlp.model, model_args, f=output.as_posix(), input_names=ordered_input_names, output_names=output_names, dynamic_axes=dynamic_axes, do_constant_folding=True, use_external_data_format=use_external_format, enable_onnx_checker=True, opset_version=opset, verbose=True ) def convert_tensorflow(nlp: Pipeline, opset: int, output: Path): """ Export a TensorFlow backed pipeline to ONNX Intermediate Representation (IR) Args: nlp: The pipeline to be exported opset: The actual version of the ONNX operator set to use output: Path where the generated ONNX model will be stored Notes: TensorFlow cannot export models bigger than 2GB due to an internal constraint in TensorFlow """ if not is_tf_available(): raise Exception("Cannot convert because TF is not installed. Please install tensorflow first.") print("/!\\ Please note TensorFlow doesn't support exporting model > 2Gb /!\\") try: import tensorflow as tf from keras2onnx import __version__ as k2ov from keras2onnx import convert_keras, save_model print(f"Using framework TensorFlow: {tf.version.VERSION}, keras2onnx: {k2ov}") # Build input_names, output_names, dynamic_axes, tokens = infer_shapes(nlp, "tf") # Forward nlp.model.predict(tokens.data) onnx_model = convert_keras(nlp.model, nlp.model.name, target_opset=opset) save_model(onnx_model, output.as_posix()) except ImportError as e: raise Exception(f"Cannot import {e.name} required to convert TF model to ONNX.
Please install {e.name} first.") def convert( framework: str, model: str, output: Path, opset: int, tokenizer: Optional[str] = None, use_external_format: bool = False, pipeline_name: str = "feature-extraction", save_config: bool = False, **model_kwargs ): """ Convert the pipeline object to the ONNX Intermediate Representation (IR) format Args: framework: The framework the pipeline is backed by ("pt" or "tf") model: The name of the model to load for the pipeline output: The path where the ONNX graph will be stored opset: The actual version of the ONNX operator set to use tokenizer: The name of the tokenizer to load for the pipeline, defaults to the model's name if not provided use_external_format: Split the model definition from its parameters to allow models bigger than 2GB (PyTorch only) pipeline_name: The kind of pipeline to instantiate (ner, question-answering, etc.) save_config: If True, also write the model and tokenizer configuration to a .json file next to the ONNX graph model_kwargs: Keyword arguments to be forwarded to the model constructor Returns: """ print(f"ONNX opset version set to: {opset}") # Load the pipeline nlp = load_graph_from_args(pipeline_name, framework, model, tokenizer, **model_kwargs) if not output.parent.exists(): print(f"Creating folder {output.parent}") makedirs(output.parent.as_posix()) #elif len(listdir(output.parent.as_posix())) > 0: # raise Exception(f"Folder {output.parent.as_posix()} is not empty, aborting conversion") # Export the graph if framework == "pt": convert_pytorch(nlp, opset, output, use_external_format) else: convert_tensorflow(nlp, opset, output) # Save the configuration if save_config: config_path = os.path.splitext(output)[0] + '.json' config = dict( model = nlp.model.config.to_dict(), tokenizer = nlp.tokenizer.init_kwargs ) #nlp.model.config.to_json_file(config_path) with open(config_path, 'w') as config_file: json.dump(config, config_file, indent=2) print(f"Saved config to {config_path}") def optimize(onnx_model_path: Path) -> Path: """ Load the model at the specified path and let onnxruntime look at transformations on the graph to enable all the optimizations possible Args: onnx_model_path: filepath where the model binary description is stored Returns: Path where the optimized model binary description has been saved """ from onnxruntime import InferenceSession, SessionOptions # Generate model name with suffix "optimized" opt_model_path = generate_identified_filename(onnx_model_path, "-optimized") sess_option = SessionOptions() sess_option.optimized_model_filepath = opt_model_path.as_posix() _ = InferenceSession(onnx_model_path.as_posix(), sess_option) print(f"Optimized model has been written at {opt_model_path}: \N{heavy check mark}") print("/!\\ Optimized model contains hardware specific operators which might not be portable. /!\\") return opt_model_path def quantize(onnx_model_path: Path) -> Path: """ Quantize the weights of the model from float32 to int8 to allow very efficient inference on modern CPU Args: onnx_model_path: Path to the location where the exported ONNX model is stored Returns: The Path generated for the quantized model """ import onnx from onnxruntime.quantization import QuantizationMode, quantize onnx_model = onnx.load(onnx_model_path.as_posix()) # Discussed with @yufenglee from ONNX runtime, this will be addressed in the next release of onnxruntime print( "As of onnxruntime 1.4.0, models larger than 2GB will fail to quantize due to protobuf constraint.\n" "This limitation will be removed in the next release of onnxruntime."
) quantized_model = quantize( model=onnx_model, quantization_mode=QuantizationMode.IntegerOps, force_fusions=True, symmetric_weight=True, ) # Append "-quantized" at the end of the model's name quantized_model_path = generate_identified_filename(onnx_model_path, "-quantized") # Save model print(f"Quantized model has been written at {quantized_model_path}: \N{heavy check mark}") onnx.save_model(quantized_model, quantized_model_path.as_posix()) return quantized_model_path def verify(path: Path): from onnxruntime import InferenceSession, SessionOptions from onnxruntime.capi.onnxruntime_pybind11_state import RuntimeException print(f"Checking ONNX model loading from: {path} ...") try: onnx_options = SessionOptions() _ = InferenceSession(path.as_posix(), onnx_options, providers=["CPUExecutionProvider"]) print(f"Model {path} correctly loaded: \N{heavy check mark}") except RuntimeException as re: print(f"Error while loading the model {re}: \N{heavy ballot x}") if __name__ == "__main__": parser = OnnxConverterArgumentParser() args = parser.parse_args() # Make sure output is absolute path args.output = Path(args.output).absolute() try: print("\n====== Converting model to ONNX ======") # Convert convert( args.framework, args.model, args.output, args.opset, args.tokenizer, args.use_external_format, args.pipeline, args.save_config ) if args.quantize: # Ensure requirements for quantization on onnxruntime is met check_onnxruntime_requirements(ORT_QUANTIZE_MINIMUM_VERSION) # onnxruntime optimizations doesn't provide the same level of performances on TensorFlow than PyTorch if args.framework == "tf": print( "\t Using TensorFlow might not provide the same optimization level compared to PyTorch.\n" "\t For TensorFlow users you can try optimizing the model directly through onnxruntime_tools.\n" "\t For more information, please refer to the onnxruntime documentation:\n" "\t\thttps://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers\n" ) print("\n====== Optimizing ONNX model ======") # Quantization works best when using the optimized version of the model args.optimized_output = optimize(args.output) # Do the quantization on the right graph args.quantized_output = quantize(args.optimized_output) # And verify if args.check_loading: print("\n====== Check exported ONNX model(s) ======") verify(args.output) if hasattr(args, "optimized_output"): verify(args.optimized_output) if hasattr(args, "quantized_output"): verify(args.quantized_output) except Exception as e: print(f"Error while converting the model: {e}") exit(1) ================================================ FILE: patches/transformers/4.5.0/modeling_distilbert.py ================================================ # coding=utf-8 # Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
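# Editor's note (descriptive only, not part of the upstream file): the
# create_sinusoidal_embeddings() helper further below fills position pos and
# channel j of the embedding table with
#     sin(pos / 10000**(2 * (j // 2) / dim))   for even j
#     cos(pos / 10000**(2 * (j // 2) / dim))   for odd j
# i.e. the fixed positional encoding from "Attention Is All You Need". It is
# only applied when config.sinusoidal_pos_embds is set; otherwise the learned
# nn.Embedding position weights are used as-is.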
""" PyTorch DistilBERT model adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM) and in part from HuggingFace PyTorch version of Google AI Bert model (https://github.com/google-research/bert) """ import copy import math import numpy as np import torch import torch.nn as nn from torch.nn import CrossEntropyLoss from ...activations import gelu from ...file_utils import ( add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings, ) from ...modeling_outputs import ( BaseModelOutput, MaskedLMOutput, MultipleChoiceModelOutput, QuestionAnsweringModelOutput, SequenceClassifierOutput, TokenClassifierOutput, ) from ...modeling_utils import ( PreTrainedModel, apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer, ) from ...utils import logging from .configuration_distilbert import DistilBertConfig logger = logging.get_logger(__name__) _CHECKPOINT_FOR_DOC = "distilbert-base-uncased" _CONFIG_FOR_DOC = "DistilBertConfig" _TOKENIZER_FOR_DOC = "DistilBertTokenizer" DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "distilbert-base-uncased", "distilbert-base-uncased-distilled-squad", "distilbert-base-cased", "distilbert-base-cased-distilled-squad", "distilbert-base-german-cased", "distilbert-base-multilingual-cased", "distilbert-base-uncased-finetuned-sst-2-english", # See all DistilBERT models at https://huggingface.co/models?filter=distilbert ] # UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE # def create_sinusoidal_embeddings(n_pos, dim, out): position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]) out.requires_grad = False out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2])) out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2])) out.detach_() class Embeddings(nn.Module): def __init__(self, config): super().__init__() self.word_embeddings = nn.Embedding(config.vocab_size, config.dim, padding_idx=config.pad_token_id) self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.dim) if config.sinusoidal_pos_embds: create_sinusoidal_embeddings( n_pos=config.max_position_embeddings, dim=config.dim, out=self.position_embeddings.weight ) self.LayerNorm = nn.LayerNorm(config.dim, eps=1e-12) self.dropout = nn.Dropout(config.dropout) def forward(self, input_ids): """ Parameters: input_ids: torch.tensor(bs, max_seq_length) The token ids to embed. 
Returns: torch.tensor(bs, max_seq_length, dim) The embedded tokens (plus position embeddings, no token_type embeddings) """ seq_length = input_ids.size(1) position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) # (max_seq_length) position_ids = position_ids.unsqueeze(0).expand_as(input_ids) # (bs, max_seq_length) word_embeddings = self.word_embeddings(input_ids) # (bs, max_seq_length, dim) position_embeddings = self.position_embeddings(position_ids) # (bs, max_seq_length, dim) embeddings = word_embeddings + position_embeddings # (bs, max_seq_length, dim) embeddings = self.LayerNorm(embeddings) # (bs, max_seq_length, dim) embeddings = self.dropout(embeddings) # (bs, max_seq_length, dim) return embeddings class MultiHeadSelfAttention(nn.Module): def __init__(self, config): super().__init__() self.n_heads = config.n_heads self.dim = config.dim self.dropout = nn.Dropout(p=config.attention_dropout) assert self.dim % self.n_heads == 0 self.q_lin = nn.Linear(in_features=config.dim, out_features=config.dim) self.k_lin = nn.Linear(in_features=config.dim, out_features=config.dim) self.v_lin = nn.Linear(in_features=config.dim, out_features=config.dim) self.out_lin = nn.Linear(in_features=config.dim, out_features=config.dim) self.pruned_heads = set() def prune_heads(self, heads): attention_head_size = self.dim // self.n_heads if len(heads) == 0: return heads, index = find_pruneable_heads_and_indices(heads, self.n_heads, attention_head_size, self.pruned_heads) # Prune linear layers self.q_lin = prune_linear_layer(self.q_lin, index) self.k_lin = prune_linear_layer(self.k_lin, index) self.v_lin = prune_linear_layer(self.v_lin, index) self.out_lin = prune_linear_layer(self.out_lin, index, dim=1) # Update hyper params self.n_heads = self.n_heads - len(heads) self.dim = attention_head_size * self.n_heads self.pruned_heads = self.pruned_heads.union(heads) def forward(self, query, key, value, mask, head_mask=None, output_attentions=False): """ Parameters: query: torch.tensor(bs, seq_length, dim) key: torch.tensor(bs, seq_length, dim) value: torch.tensor(bs, seq_length, dim) mask: torch.tensor(bs, seq_length) Returns: weights: torch.tensor(bs, n_heads, seq_length, seq_length) Attention weights context: torch.tensor(bs, seq_length, dim) Contextualized layer. 
Optional: only if `output_attentions=True` """ bs, q_length, dim = query.size() k_length = key.size(1) # assert dim == self.dim, f'Dimensions do not match: {dim} input vs {self.dim} configured' # assert key.size() == value.size() dim_per_head = self.dim // self.n_heads mask_reshp = (bs, 1, 1, k_length) def shape(x): """ separate heads """ return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2) def unshape(x): """ group heads """ return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head) q = shape(self.q_lin(query)) # (bs, n_heads, q_length, dim_per_head) k = shape(self.k_lin(key)) # (bs, n_heads, k_length, dim_per_head) v = shape(self.v_lin(value)) # (bs, n_heads, k_length, dim_per_head) q = q / math.sqrt(dim_per_head) # (bs, n_heads, q_length, dim_per_head) scores = torch.matmul(q, k.transpose(2, 3)) # (bs, n_heads, q_length, k_length) mask = mask.view(mask_reshp).expand_as(scores) # (bs, n_heads, q_length, k_length) scores.masked_fill_((mask == 0), -float("inf")) # (bs, n_heads, q_length, k_length) weights = nn.Softmax(dim=-1)(scores) # (bs, n_heads, q_length, k_length) weights = self.dropout(weights) # (bs, n_heads, q_length, k_length) # Mask heads if we want to if head_mask is not None: weights = weights * head_mask context = torch.matmul(weights, v) # (bs, n_heads, q_length, dim_per_head) context = unshape(context) # (bs, q_length, dim) context = self.out_lin(context) # (bs, q_length, dim) if output_attentions: return (context, weights) else: return (context,) class FFN(nn.Module): def __init__(self, config): super().__init__() self.dropout = nn.Dropout(p=config.dropout) self.chunk_size_feed_forward = config.chunk_size_feed_forward self.seq_len_dim = 1 self.lin1 = nn.Linear(in_features=config.dim, out_features=config.hidden_dim) self.lin2 = nn.Linear(in_features=config.hidden_dim, out_features=config.dim) assert config.activation in ["relu", "gelu"], f"activation ({config.activation}) must be in ['relu', 'gelu']" self.activation = gelu if config.activation == "gelu" else nn.ReLU() def forward(self, input): return apply_chunking_to_forward(self.ff_chunk, self.chunk_size_feed_forward, self.seq_len_dim, input) def ff_chunk(self, input): x = self.lin1(input) x = self.activation(x) x = self.lin2(x) x = self.dropout(x) return x class TransformerBlock(nn.Module): def __init__(self, config): super().__init__() assert config.dim % config.n_heads == 0 self.attention = MultiHeadSelfAttention(config) self.sa_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12) self.ffn = FFN(config) self.output_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12) def forward(self, x, attn_mask=None, head_mask=None, output_attentions=False): """ Parameters: x: torch.tensor(bs, seq_length, dim) attn_mask: torch.tensor(bs, seq_length) Returns: sa_weights: torch.tensor(bs, n_heads, seq_length, seq_length) The attention weights ffn_output: torch.tensor(bs, seq_length, dim) The output of the transformer block contextualization. 
""" # Self-Attention sa_output = self.attention( query=x, key=x, value=x, mask=attn_mask, head_mask=head_mask, output_attentions=output_attentions, ) if output_attentions: sa_output, sa_weights = sa_output # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length) else: # To handle these `output_attentions` or `output_hidden_states` cases returning tuples assert type(sa_output) == tuple sa_output = sa_output[0] sa_output = self.sa_layer_norm(sa_output + x) # (bs, seq_length, dim) # Feed Forward Network ffn_output = self.ffn(sa_output) # (bs, seq_length, dim) ffn_output = self.output_layer_norm(ffn_output + sa_output) # (bs, seq_length, dim) output = (ffn_output,) if output_attentions: output = (sa_weights,) + output return output class Transformer(nn.Module): def __init__(self, config): super().__init__() self.n_layers = config.n_layers layer = TransformerBlock(config) self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.n_layers)]) def forward( self, x, attn_mask=None, head_mask=None, output_attentions=False, output_hidden_states=False, return_dict=None ): # docstyle-ignore """ Parameters: x: torch.tensor(bs, seq_length, dim) Input sequence embedded. attn_mask: torch.tensor(bs, seq_length) Attention mask on the sequence. Returns: hidden_state: torch.tensor(bs, seq_length, dim) Sequence of hidden states in the last (top) layer all_hidden_states: Tuple[torch.tensor(bs, seq_length, dim)] Tuple of length n_layers with the hidden states from each layer. Optional: only if output_hidden_states=True all_attentions: Tuple[torch.tensor(bs, n_heads, seq_length, seq_length)] Tuple of length n_layers with the attention weights from each layer Optional: only if output_attentions=True """ all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None hidden_state = x for i, layer_module in enumerate(self.layer): if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_state,) layer_outputs = layer_module( x=hidden_state, attn_mask=attn_mask, head_mask=head_mask[i], output_attentions=output_attentions ) hidden_state = layer_outputs[-1] if output_attentions: assert len(layer_outputs) == 2 attentions = layer_outputs[0] all_attentions = all_attentions + (attentions,) else: assert len(layer_outputs) == 1 # Add last layer if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_state,) if not return_dict: return tuple(v for v in [hidden_state, all_hidden_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_state, hidden_states=all_hidden_states, attentions=all_attentions ) # INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL # class DistilBertPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. 
""" config_class = DistilBertConfig load_tf_weights = None base_model_prefix = "distilbert" def _init_weights(self, module): """Initialize the weights.""" if isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if module.bias is not None: module.bias.data.zero_() elif isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) DISTILBERT_START_DOCSTRING = r""" This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) This model is also a PyTorch `torch.nn.Module `__ subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers.DistilBertConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ DISTILBERT_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`~transformers.DistilBertTokenizer`. See :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. `What are attention masks? <../glossary.html#attention-mask>`__ head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: - 1 indicates the head is **not masked**, - 0 indicates the head is **masked**. inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert :obj:`input_ids` indices into associated vectors than the model's internal embedding lookup matrix. output_attentions (:obj:`bool`, `optional`): Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned tensors for more detail. output_hidden_states (:obj:`bool`, `optional`): Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for more detail. return_dict (:obj:`bool`, `optional`): Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. 
""" @add_start_docstrings( "The bare DistilBERT encoder/transformer outputting raw hidden-states without any specific head on top.", DISTILBERT_START_DOCSTRING, ) class DistilBertModel(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.embeddings = Embeddings(config) # Embeddings self.transformer = Transformer(config) # Encoder self.init_weights() def get_input_embeddings(self): return self.embeddings.word_embeddings def set_input_embeddings(self, new_embeddings): self.embeddings.word_embeddings = new_embeddings def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ for layer, heads in heads_to_prune.items(): self.transformer.layer[layer].attention.prune_heads(heads) @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC, ) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = input_ids.size() elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") device = input_ids.device if input_ids is not None else inputs_embeds.device if attention_mask is None: attention_mask = torch.ones(input_shape, device=device) # (bs, seq_length) # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) if inputs_embeds is None: inputs_embeds = self.embeddings(input_ids) # (bs, seq_length, dim) return self.transformer( x=inputs_embeds, attn_mask=attention_mask, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) @add_start_docstrings( """DistilBert Model with a `masked language modeling` head on top. 
""", DISTILBERT_START_DOCSTRING, ) class DistilBertForMaskedLM(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.distilbert = DistilBertModel(config) self.vocab_transform = nn.Linear(config.dim, config.dim) self.vocab_layer_norm = nn.LayerNorm(config.dim, eps=1e-12) self.vocab_projector = nn.Linear(config.dim, config.vocab_size) self.init_weights() self.mlm_loss_fct = nn.CrossEntropyLoss() def get_output_embeddings(self): return self.vocab_projector def set_output_embeddings(self, new_embeddings): self.vocab_projector = new_embeddings @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC, ) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``. """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict dlbrt_output = self.distilbert( input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_states = dlbrt_output[0] # (bs, seq_length, dim) prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim) prediction_logits = gelu(prediction_logits) # (bs, seq_length, dim) prediction_logits = self.vocab_layer_norm(prediction_logits) # (bs, seq_length, dim) prediction_logits = self.vocab_projector(prediction_logits) # (bs, seq_length, vocab_size) mlm_loss = None if labels is not None: mlm_loss = self.mlm_loss_fct(prediction_logits.view(-1, prediction_logits.size(-1)), labels.view(-1)) if not return_dict: output = (prediction_logits,) + dlbrt_output[1:] return ((mlm_loss,) + output) if mlm_loss is not None else output return MaskedLMOutput( loss=mlm_loss, logits=prediction_logits, hidden_states=dlbrt_output.hidden_states, attentions=dlbrt_output.attentions, ) @add_start_docstrings( """ DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. 
""", DISTILBERT_START_DOCSTRING, ) class DistilBertForSequenceClassification(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.distilbert = DistilBertModel(config) self.pre_classifier = nn.Linear(config.dim, config.dim) self.classifier = nn.Linear(config.dim, config.num_labels) self.dropout = nn.Dropout(config.seq_classif_dropout) self.init_weights() @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, ) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict distilbert_output = self.distilbert( input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_state = distilbert_output[0] # (bs, seq_len, dim) pooled_output = hidden_state[:, 0] # (bs, dim) pooled_output = self.pre_classifier(pooled_output) # (bs, dim) pooled_output = nn.ReLU()(pooled_output) # (bs, dim) pooled_output = self.dropout(pooled_output) # (bs, dim) logits = self.classifier(pooled_output) # (bs, num_labels) loss = None if labels is not None: if self.num_labels == 1: loss_fct = nn.MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = nn.CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) if not return_dict: output = (logits,) + distilbert_output[1:] return ((loss,) + output) if loss is not None else output return SequenceClassifierOutput( loss=loss, logits=logits, hidden_states=distilbert_output.hidden_states, attentions=distilbert_output.attentions, ) @add_start_docstrings( """ DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
""", DISTILBERT_START_DOCSTRING, ) class DistilBertForQuestionAnswering(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.distilbert = DistilBertModel(config) self.qa_outputs = nn.Linear(config.dim, config.num_labels) assert config.num_labels == 2 self.dropout = nn.Dropout(config.qa_dropout) self.init_weights() @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=QuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC, ) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict distilbert_output = self.distilbert( input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_states = distilbert_output[0] # (bs, max_query_len, dim) hidden_states = self.dropout(hidden_states) # (bs, max_query_len, dim) logits = self.qa_outputs(hidden_states) # (bs, max_query_len, 2) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1) # (bs, max_query_len) end_logits = end_logits.squeeze(-1) # (bs, max_query_len) total_loss = None if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: start_positions = start_positions.squeeze(-1) if len(end_positions.size()) > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms ignored_index = start_logits.size(1) start_positions.clamp_(0, ignored_index) end_positions.clamp_(0, ignored_index) loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index) start_loss = loss_fct(start_logits, start_positions) end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 if not return_dict: output = (start_logits, end_logits) + distilbert_output[1:] return ((total_loss,) + output) if total_loss is not None else output return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, end_logits=end_logits, hidden_states=distilbert_output.hidden_states, attentions=distilbert_output.attentions, ) @add_start_docstrings( """ DistilBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 
""", DISTILBERT_START_DOCSTRING, ) class DistilBertForTokenClassification(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.distilbert = DistilBertModel(config) self.dropout = nn.Dropout(config.dropout) self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC, ) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.distilbert( input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) loss = None if labels is not None: loss_fct = CrossEntropyLoss() # Only keep active parts of the loss if attention_mask is not None: active_loss = attention_mask.view(-1) == 1 active_logits = logits.view(-1, self.num_labels) active_labels = torch.where( active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) ) loss = loss_fct(active_logits, active_labels) else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) if not return_dict: output = (logits,) + outputs[1:] return ((loss,) + output) if loss is not None else output return TokenClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) @add_start_docstrings( """ DistilBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, DISTILBERT_START_DOCSTRING, ) class DistilBertForMultipleChoice(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.distilbert = DistilBertModel(config) self.pre_classifier = nn.Linear(config.dim, config.dim) self.classifier = nn.Linear(config.dim, 1) self.dropout = nn.Dropout(config.seq_classif_dropout) self.init_weights() @add_start_docstrings_to_model_forward( DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") ) @replace_return_docstrings(output_type=MultipleChoiceModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. 
(See :obj:`input_ids` above) Returns: Examples:: >>> from transformers import DistilBertTokenizer, DistilBertForMultipleChoice >>> import torch >>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') >>> model = DistilBertForMultipleChoice.from_pretrained('distilbert-base-cased') >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." >>> choice0 = "It is eaten with a fork and a knife." >>> choice1 = "It is eaten while held in the hand." >>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1 >>> encoding = tokenizer([[prompt, choice0], [prompt, choice1]], return_tensors='pt', padding=True) >>> outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1 >>> # the linear classifier still needs to be trained >>> loss = outputs.loss >>> logits = outputs.logits """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None inputs_embeds = ( inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) if inputs_embeds is not None else None ) outputs = self.distilbert( input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_state = outputs[0] # (bs * num_choices, seq_len, dim) pooled_output = hidden_state[:, 0] # (bs * num_choices, dim) pooled_output = self.pre_classifier(pooled_output) # (bs * num_choices, dim) pooled_output = nn.ReLU()(pooled_output) # (bs * num_choices, dim) pooled_output = self.dropout(pooled_output) # (bs * num_choices, dim) logits = self.classifier(pooled_output) # (bs * num_choices, 1) reshaped_logits = logits.view(-1, num_choices) # (bs, num_choices) loss = None if labels is not None: loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) if not return_dict: output = (reshaped_logits,) + outputs[1:] return ((loss,) + output) if loss is not None else output return MultipleChoiceModelOutput( loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) ================================================ FILE: patches/transformers/4.5.1/convert_graph_to_onnx.diff ================================================ 14a15,17 > import os > import json > 83a87,91 > "--save-config", > action="store_true", > help="Save the model configuration along with the ONNX", > ) > self.add_argument( 280a289,295 > print('Exporting from PyTorch to ONNX...') > print('input_names', input_names) > print('output_names', output_names) > print('dynamic_axes', dynamic_axes) > print('tokens', tokens) > print('model_args', model_args) > 291a307 > verbose=True 339a356 > save_config: bool = False, 366,367c383,384 < elif len(listdir(output.parent.as_posix())) > 0: < raise Exception(f"Folder {output.parent.as_posix()} is not empty, aborting conversion") --- > #elif len(listdir(output.parent.as_posix())) > 0: > # raise Exception(f"Folder {output.parent.as_posix()} is not empty, aborting conversion") 374c391,407 < --- > > # Save the configuration > if save_config: > config_path = os.path.splitext(output)[0] + '.json' > > config = dict( > 
model = nlp.model.config.to_dict(), > tokenizer = nlp.tokenizer.init_kwargs > ) > > #nlp.model.config.to_json_file(config_path) > > with open(config_path, 'w') as config_file: > json.dump(config, config_file, indent=2) > > print(f"Saved config to {config_path}") > 468a502 > args.save_config ================================================ FILE: patches/transformers/4.5.1/convert_graph_to_onnx.original.py ================================================ # Copyright 2020 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from argparse import ArgumentParser from os import listdir, makedirs from pathlib import Path from typing import Dict, List, Optional, Tuple from packaging.version import Version, parse from transformers.file_utils import ModelOutput, is_tf_available, is_torch_available from transformers.pipelines import Pipeline, pipeline from transformers.tokenization_utils import BatchEncoding # This is the minimal required version to # support some ONNX Runtime features ORT_QUANTIZE_MINIMUM_VERSION = parse("1.4.0") SUPPORTED_PIPELINES = [ "feature-extraction", "ner", "sentiment-analysis", "fill-mask", "question-answering", "text-generation", "translation_en_to_fr", "translation_en_to_de", "translation_en_to_ro", ] class OnnxConverterArgumentParser(ArgumentParser): """ Wraps all the script arguments supported to export transformers models to ONNX IR """ def __init__(self): super().__init__("ONNX Converter") self.add_argument( "--pipeline", type=str, choices=SUPPORTED_PIPELINES, default="feature-extraction", ) self.add_argument( "--model", type=str, required=True, help="Model's id or path (ex: bert-base-cased)", ) self.add_argument("--tokenizer", type=str, help="Tokenizer's id or path (ex: bert-base-cased)") self.add_argument( "--framework", type=str, choices=["pt", "tf"], help="Framework for loading the model", ) self.add_argument("--opset", type=int, default=11, help="ONNX opset to use") self.add_argument( "--check-loading", action="store_true", help="Check ONNX is able to load the model", ) self.add_argument( "--use-external-format", action="store_true", help="Allow exporting model >= than 2Gb", ) self.add_argument( "--quantize", action="store_true", help="Quantize the neural network to be run with int8", ) self.add_argument("output") def generate_identified_filename(filename: Path, identifier: str) -> Path: """ Append a string-identifier at the end (before the extension, if any) to the provided filepath Args: filename: pathlib.Path The actual path object we would like to add an identifier suffix identifier: The suffix to add Returns: String with concatenated identifier at the end of the filename """ return filename.parent.joinpath(filename.stem + identifier).with_suffix(filename.suffix) def check_onnxruntime_requirements(minimum_version: Version): """ Check onnxruntime is installed and if the installed version match is recent enough Raises: ImportError: If onnxruntime is not installed or too old version is found """ try: import onnxruntime # Parse the version of 
the installed onnxruntime ort_version = parse(onnxruntime.__version__) # We require 1.4.0 minimum if ort_version < ORT_QUANTIZE_MINIMUM_VERSION: raise ImportError( f"We found an older version of onnxruntime ({onnxruntime.__version__}) " f"but we require onnxruntime to be >= {minimum_version} to enable all the conversions options.\n" f"Please update onnxruntime by running `pip install --upgrade onnxruntime`" ) except ImportError: raise ImportError( "onnxruntime doesn't seem to be currently installed. " "Please install the onnxruntime by running `pip install onnxruntime`" " and relaunch the conversion." ) def ensure_valid_input(model, tokens, input_names): """ Ensure input are presented in the correct order, without any Non Args: model: The model used to forward the input data tokens: BatchEncoding holding the input data input_names: The name of the inputs Returns: Tuple """ print("Ensuring inputs are in correct order") model_args_name = model.forward.__code__.co_varnames model_args, ordered_input_names = [], [] for arg_name in model_args_name[1:]: # start at index 1 to skip "self" argument if arg_name in input_names: ordered_input_names.append(arg_name) model_args.append(tokens[arg_name]) else: print(f"{arg_name} is not present in the generated input list.") break print(f"Generated inputs order: {ordered_input_names}") return ordered_input_names, tuple(model_args) def infer_shapes(nlp: Pipeline, framework: str) -> Tuple[List[str], List[str], Dict, BatchEncoding]: """ Attempt to infer the static vs dynamic axes for each input and output tensors for a specific model Args: nlp: The pipeline object holding the model to be exported framework: The framework identifier to dispatch to the correct inference scheme (pt/tf) Returns: - List of the inferred input variable names - List of the inferred output variable names - Dictionary with input/output variables names as key and shape tensor as value - a BatchEncoding reference which was used to infer all the above information """ def build_shape_dict(name: str, tensor, is_input: bool, seq_len: int): if isinstance(tensor, (tuple, list)): return [build_shape_dict(name, t, is_input, seq_len) for t in tensor] else: # Let's assume batch is the first axis with only 1 element (~~ might not be always true ...) 
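# For example (illustrative values, not produced by this script): a tokenized
# input of shape (1, 7) with seq_len == 7 makes the comprehension below choose
# axis 0 as "batch" (the only axis holding a single element), and the is_input
# branch then marks axis 1 as "sequence", so the tensor's dynamic_axes entry
# becomes {0: "batch", 1: "sequence"}:
#
#   >>> import torch
#   >>> tensor = torch.zeros(1, 7)
#   >>> [axis for axis, numel in enumerate(tensor.shape) if numel == 1][0]
#   0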
axes = {[axis for axis, numel in enumerate(tensor.shape) if numel == 1][0]: "batch"} if is_input: if len(tensor.shape) == 2: axes[1] = "sequence" else: raise ValueError(f"Unable to infer tensor axes ({len(tensor.shape)})") else: seq_axes = [dim for dim, shape in enumerate(tensor.shape) if shape == seq_len] axes.update({dim: "sequence" for dim in seq_axes}) print(f"Found {'input' if is_input else 'output'} {name} with shape: {axes}") return axes tokens = nlp.tokenizer("This is a sample output", return_tensors=framework) seq_len = tokens.input_ids.shape[-1] outputs = nlp.model(**tokens) if framework == "pt" else nlp.model(tokens) if isinstance(outputs, ModelOutput): outputs = outputs.to_tuple() if not isinstance(outputs, (list, tuple)): outputs = (outputs,) # Generate input names & axes input_vars = list(tokens.keys()) input_dynamic_axes = {k: build_shape_dict(k, v, True, seq_len) for k, v in tokens.items()} # flatten potentially grouped outputs (past for gpt2, attentions) outputs_flat = [] for output in outputs: if isinstance(output, (tuple, list)): outputs_flat.extend(output) else: outputs_flat.append(output) # Generate output names & axes output_names = [f"output_{i}" for i in range(len(outputs_flat))] output_dynamic_axes = {k: build_shape_dict(k, v, False, seq_len) for k, v in zip(output_names, outputs_flat)} # Create the aggregated axes representation dynamic_axes = dict(input_dynamic_axes, **output_dynamic_axes) return input_vars, output_names, dynamic_axes, tokens def load_graph_from_args( pipeline_name: str, framework: str, model: str, tokenizer: Optional[str] = None, **models_kwargs ) -> Pipeline: """ Convert the set of arguments provided through the CLI to an actual pipeline reference (tokenizer + model Args: pipeline_name: The kind of pipeline to use (ner, question-answering, etc.) framework: The actual model to convert the pipeline from ("pt" or "tf") model: The model name which will be loaded by the pipeline tokenizer: The tokenizer name which will be loaded by the pipeline, default to the model's value Returns: Pipeline object """ # If no tokenizer provided if tokenizer is None: tokenizer = model # Check the wanted framework is available if framework == "pt" and not is_torch_available(): raise Exception("Cannot convert because PyTorch is not installed. Please install torch first.") if framework == "tf" and not is_tf_available(): raise Exception("Cannot convert because TF is not installed. Please install tensorflow first.") print(f"Loading pipeline (model: {model}, tokenizer: {tokenizer})") # Allocate tokenizer and model return pipeline(pipeline_name, model=model, tokenizer=tokenizer, framework=framework, model_kwargs=models_kwargs) def convert_pytorch(nlp: Pipeline, opset: int, output: Path, use_external_format: bool): """ Export a PyTorch backed pipeline to ONNX Intermediate Representation (IR Args: nlp: The pipeline to be exported opset: The actual version of the ONNX operator set to use output: Path where will be stored the generated ONNX model use_external_format: Split the model definition from its parameters to allow model bigger than 2GB Returns: """ if not is_torch_available(): raise Exception("Cannot convert because PyTorch is not installed. 
Please install torch first.") import torch from torch.onnx import export print(f"Using framework PyTorch: {torch.__version__}") with torch.no_grad(): input_names, output_names, dynamic_axes, tokens = infer_shapes(nlp, "pt") ordered_input_names, model_args = ensure_valid_input(nlp.model, tokens, input_names) export( nlp.model, model_args, f=output.as_posix(), input_names=ordered_input_names, output_names=output_names, dynamic_axes=dynamic_axes, do_constant_folding=True, use_external_data_format=use_external_format, enable_onnx_checker=True, opset_version=opset, ) def convert_tensorflow(nlp: Pipeline, opset: int, output: Path): """ Export a TensorFlow backed pipeline to ONNX Intermediate Representation (IR Args: nlp: The pipeline to be exported opset: The actual version of the ONNX operator set to use output: Path where will be stored the generated ONNX model Notes: TensorFlow cannot export model bigger than 2GB due to internal constraint from TensorFlow """ if not is_tf_available(): raise Exception("Cannot convert because TF is not installed. Please install tensorflow first.") print("/!\\ Please note TensorFlow doesn't support exporting model > 2Gb /!\\") try: import tensorflow as tf from keras2onnx import __version__ as k2ov from keras2onnx import convert_keras, save_model print(f"Using framework TensorFlow: {tf.version.VERSION}, keras2onnx: {k2ov}") # Build input_names, output_names, dynamic_axes, tokens = infer_shapes(nlp, "tf") # Forward nlp.model.predict(tokens.data) onnx_model = convert_keras(nlp.model, nlp.model.name, target_opset=opset) save_model(onnx_model, output.as_posix()) except ImportError as e: raise Exception(f"Cannot import {e.name} required to convert TF model to ONNX. Please install {e.name} first.") def convert( framework: str, model: str, output: Path, opset: int, tokenizer: Optional[str] = None, use_external_format: bool = False, pipeline_name: str = "feature-extraction", **model_kwargs ): """ Convert the pipeline object to the ONNX Intermediate Representation (IR) format Args: framework: The framework the pipeline is backed by ("pt" or "tf") model: The name of the model to load for the pipeline output: The path where the ONNX graph will be stored opset: The actual version of the ONNX operator set to use tokenizer: The name of the model to load for the pipeline, default to the model's name if not provided use_external_format: Split the model definition from its parameters to allow model bigger than 2GB (PyTorch only) pipeline_name: The kind of pipeline to instantiate (ner, question-answering, etc.) 
model_kwargs: Keyword arguments to be forwarded to the model constructor Returns: """ print(f"ONNX opset version set to: {opset}") # Load the pipeline nlp = load_graph_from_args(pipeline_name, framework, model, tokenizer, **model_kwargs) if not output.parent.exists(): print(f"Creating folder {output.parent}") makedirs(output.parent.as_posix()) elif len(listdir(output.parent.as_posix())) > 0: raise Exception(f"Folder {output.parent.as_posix()} is not empty, aborting conversion") # Export the graph if framework == "pt": convert_pytorch(nlp, opset, output, use_external_format) else: convert_tensorflow(nlp, opset, output) def optimize(onnx_model_path: Path) -> Path: """ Load the model at the specified path and let onnxruntime look at transformations on the graph to enable all the optimizations possibl Args: onnx_model_path: filepath where the model binary description is stored Returns: Path where the optimized model binary description has been saved """ from onnxruntime import InferenceSession, SessionOptions # Generate model name with suffix "optimized" opt_model_path = generate_identified_filename(onnx_model_path, "-optimized") sess_option = SessionOptions() sess_option.optimized_model_filepath = opt_model_path.as_posix() _ = InferenceSession(onnx_model_path.as_posix(), sess_option) print(f"Optimized model has been written at {opt_model_path}: \N{heavy check mark}") print("/!\\ Optimized model contains hardware specific operators which might not be portable. /!\\") return opt_model_path def quantize(onnx_model_path: Path) -> Path: """ Quantize the weights of the model from float32 to in8 to allow very efficient inference on modern CPU Args: onnx_model_path: Path to location the exported ONNX model is stored Returns: The Path generated for the quantized """ import onnx from onnxruntime.quantization import QuantizationMode, quantize onnx_model = onnx.load(onnx_model_path.as_posix()) # Discussed with @yufenglee from ONNX runtime, this will be address in the next release of onnxruntime print( "As of onnxruntime 1.4.0, models larger than 2GB will fail to quantize due to protobuf constraint.\n" "This limitation will be removed in the next release of onnxruntime." 
) quantized_model = quantize( model=onnx_model, quantization_mode=QuantizationMode.IntegerOps, force_fusions=True, symmetric_weight=True, ) # Append "-quantized" at the end of the model's name quantized_model_path = generate_identified_filename(onnx_model_path, "-quantized") # Save model print(f"Quantized model has been written at {quantized_model_path}: \N{heavy check mark}") onnx.save_model(quantized_model, quantized_model_path.as_posix()) return quantized_model_path def verify(path: Path): from onnxruntime import InferenceSession, SessionOptions from onnxruntime.capi.onnxruntime_pybind11_state import RuntimeException print(f"Checking ONNX model loading from: {path} ...") try: onnx_options = SessionOptions() _ = InferenceSession(path.as_posix(), onnx_options, providers=["CPUExecutionProvider"]) print(f"Model {path} correctly loaded: \N{heavy check mark}") except RuntimeException as re: print(f"Error while loading the model {re}: \N{heavy ballot x}") if __name__ == "__main__": parser = OnnxConverterArgumentParser() args = parser.parse_args() # Make sure output is absolute path args.output = Path(args.output).absolute() try: print("\n====== Converting model to ONNX ======") # Convert convert( args.framework, args.model, args.output, args.opset, args.tokenizer, args.use_external_format, args.pipeline, ) if args.quantize: # Ensure requirements for quantization on onnxruntime is met check_onnxruntime_requirements(ORT_QUANTIZE_MINIMUM_VERSION) # onnxruntime optimizations doesn't provide the same level of performances on TensorFlow than PyTorch if args.framework == "tf": print( "\t Using TensorFlow might not provide the same optimization level compared to PyTorch.\n" "\t For TensorFlow users you can try optimizing the model directly through onnxruntime_tools.\n" "\t For more information, please refer to the onnxruntime documentation:\n" "\t\thttps://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers\n" ) print("\n====== Optimizing ONNX model ======") # Quantization works best when using the optimized version of the model args.optimized_output = optimize(args.output) # Do the quantization on the right graph args.quantized_output = quantize(args.optimized_output) # And verify if args.check_loading: print("\n====== Check exported ONNX model(s) ======") verify(args.output) if hasattr(args, "optimized_output"): verify(args.optimized_output) if hasattr(args, "quantized_output"): verify(args.quantized_output) except Exception as e: print(f"Error while converting the model: {e}") exit(1) ================================================ FILE: patches/transformers/4.5.1/convert_graph_to_onnx.py ================================================ # Copyright 2020 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
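# Relative to upstream transformers 4.5.1 (see convert_graph_to_onnx.diff
# above), this patched copy adds a --save-config flag that writes the model
# config and tokenizer init kwargs to a sidecar .json beside the ONNX graph,
# prints the inferred export metadata, enables verbose torch.onnx.export
# output, and no longer aborts when the output folder is non-empty. A hedged
# usage sketch of the patched convert() entry point; the model name and output
# path are illustrative, and importing this file as a module is an assumption:
#
#   from pathlib import Path
#   from convert_graph_to_onnx import convert
#
#   convert("pt", "distilbert-base-cased-distilled-squad",
#           Path("models/distilbert-qa.onnx"), opset=11,
#           pipeline_name="question-answering", save_config=True)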
import os import json from argparse import ArgumentParser from os import listdir, makedirs from pathlib import Path from typing import Dict, List, Optional, Tuple from packaging.version import Version, parse from transformers.file_utils import ModelOutput, is_tf_available, is_torch_available from transformers.pipelines import Pipeline, pipeline from transformers.tokenization_utils import BatchEncoding # This is the minimal required version to # support some ONNX Runtime features ORT_QUANTIZE_MINIMUM_VERSION = parse("1.4.0") SUPPORTED_PIPELINES = [ "feature-extraction", "ner", "sentiment-analysis", "fill-mask", "question-answering", "text-generation", "translation_en_to_fr", "translation_en_to_de", "translation_en_to_ro", ] class OnnxConverterArgumentParser(ArgumentParser): """ Wraps all the script arguments supported to export transformers models to ONNX IR """ def __init__(self): super().__init__("ONNX Converter") self.add_argument( "--pipeline", type=str, choices=SUPPORTED_PIPELINES, default="feature-extraction", ) self.add_argument( "--model", type=str, required=True, help="Model's id or path (ex: bert-base-cased)", ) self.add_argument("--tokenizer", type=str, help="Tokenizer's id or path (ex: bert-base-cased)") self.add_argument( "--framework", type=str, choices=["pt", "tf"], help="Framework for loading the model", ) self.add_argument("--opset", type=int, default=11, help="ONNX opset to use") self.add_argument( "--check-loading", action="store_true", help="Check ONNX is able to load the model", ) self.add_argument( "--use-external-format", action="store_true", help="Allow exporting model >= than 2Gb", ) self.add_argument( "--save-config", action="store_true", help="Save the model configuration along with the ONNX", ) self.add_argument( "--quantize", action="store_true", help="Quantize the neural network to be run with int8", ) self.add_argument("output") def generate_identified_filename(filename: Path, identifier: str) -> Path: """ Append a string-identifier at the end (before the extension, if any) to the provided filepath Args: filename: pathlib.Path The actual path object we would like to add an identifier suffix identifier: The suffix to add Returns: String with concatenated identifier at the end of the filename """ return filename.parent.joinpath(filename.stem + identifier).with_suffix(filename.suffix) def check_onnxruntime_requirements(minimum_version: Version): """ Check onnxruntime is installed and if the installed version match is recent enough Raises: ImportError: If onnxruntime is not installed or too old version is found """ try: import onnxruntime # Parse the version of the installed onnxruntime ort_version = parse(onnxruntime.__version__) # We require 1.4.0 minimum if ort_version < ORT_QUANTIZE_MINIMUM_VERSION: raise ImportError( f"We found an older version of onnxruntime ({onnxruntime.__version__}) " f"but we require onnxruntime to be >= {minimum_version} to enable all the conversions options.\n" f"Please update onnxruntime by running `pip install --upgrade onnxruntime`" ) except ImportError: raise ImportError( "onnxruntime doesn't seem to be currently installed. " "Please install the onnxruntime by running `pip install onnxruntime`" " and relaunch the conversion." 
) def ensure_valid_input(model, tokens, input_names): """ Ensure inputs are presented in the correct order, without any None values Args: model: The model used to forward the input data tokens: BatchEncoding holding the input data input_names: The name of the inputs Returns: Tuple of the ordered input names and the corresponding model arguments """ print("Ensuring inputs are in correct order") model_args_name = model.forward.__code__.co_varnames model_args, ordered_input_names = [], [] for arg_name in model_args_name[1:]: # start at index 1 to skip "self" argument if arg_name in input_names: ordered_input_names.append(arg_name) model_args.append(tokens[arg_name]) else: print(f"{arg_name} is not present in the generated input list.") break print(f"Generated inputs order: {ordered_input_names}") return ordered_input_names, tuple(model_args) def infer_shapes(nlp: Pipeline, framework: str) -> Tuple[List[str], List[str], Dict, BatchEncoding]: """ Attempt to infer the static vs dynamic axes for each input and output tensor of a specific model Args: nlp: The pipeline object holding the model to be exported framework: The framework identifier to dispatch to the correct inference scheme (pt/tf) Returns: - List of the inferred input variable names - List of the inferred output variable names - Dictionary with input/output variable names as keys and dynamic axes as values - a BatchEncoding reference which was used to infer all the above information """ def build_shape_dict(name: str, tensor, is_input: bool, seq_len: int): if isinstance(tensor, (tuple, list)): return [build_shape_dict(name, t, is_input, seq_len) for t in tensor] else: # Let's assume batch is the first axis with only 1 element (might not always be true ...) axes = {[axis for axis, numel in enumerate(tensor.shape) if numel == 1][0]: "batch"} if is_input: if len(tensor.shape) == 2: axes[1] = "sequence" else: raise ValueError(f"Unable to infer tensor axes ({len(tensor.shape)})") else: seq_axes = [dim for dim, shape in enumerate(tensor.shape) if shape == seq_len] axes.update({dim: "sequence" for dim in seq_axes}) print(f"Found {'input' if is_input else 'output'} {name} with shape: {axes}") return axes tokens = nlp.tokenizer("This is a sample output", return_tensors=framework) seq_len = tokens.input_ids.shape[-1] outputs = nlp.model(**tokens) if framework == "pt" else nlp.model(tokens) if isinstance(outputs, ModelOutput): outputs = outputs.to_tuple() if not isinstance(outputs, (list, tuple)): outputs = (outputs,) # Generate input names & axes input_vars = list(tokens.keys()) input_dynamic_axes = {k: build_shape_dict(k, v, True, seq_len) for k, v in tokens.items()} # flatten potentially grouped outputs (past for gpt2, attentions) outputs_flat = [] for output in outputs: if isinstance(output, (tuple, list)): outputs_flat.extend(output) else: outputs_flat.append(output) # Generate output names & axes output_names = [f"output_{i}" for i in range(len(outputs_flat))] output_dynamic_axes = {k: build_shape_dict(k, v, False, seq_len) for k, v in zip(output_names, outputs_flat)} # Create the aggregated axes representation dynamic_axes = dict(input_dynamic_axes, **output_dynamic_axes) return input_vars, output_names, dynamic_axes, tokens def load_graph_from_args( pipeline_name: str, framework: str, model: str, tokenizer: Optional[str] = None, **models_kwargs ) -> Pipeline: """ Convert the set of arguments provided through the CLI to an actual pipeline reference (tokenizer + model) Args: pipeline_name: The kind of pipeline to use (ner, question-answering, etc.) framework: The framework the pipeline is backed by ("pt" or "tf") model: The model name which will be loaded by the pipeline tokenizer: The tokenizer name which will be loaded by the pipeline, defaults to the model's value Returns: Pipeline object """ # If no tokenizer provided if tokenizer is None: tokenizer = model # Check the wanted framework is available if framework == "pt" and not is_torch_available(): raise Exception("Cannot convert because PyTorch is not installed. Please install torch first.") if framework == "tf" and not is_tf_available(): raise Exception("Cannot convert because TF is not installed. Please install tensorflow first.") print(f"Loading pipeline (model: {model}, tokenizer: {tokenizer})") # Allocate tokenizer and model return pipeline(pipeline_name, model=model, tokenizer=tokenizer, framework=framework, model_kwargs=models_kwargs) def convert_pytorch(nlp: Pipeline, opset: int, output: Path, use_external_format: bool): """ Export a PyTorch backed pipeline to ONNX Intermediate Representation (IR) Args: nlp: The pipeline to be exported opset: The actual version of the ONNX operator set to use output: Path where the generated ONNX model will be stored use_external_format: Split the model definition from its parameters to allow models bigger than 2GB Returns: """ if not is_torch_available(): raise Exception("Cannot convert because PyTorch is not installed. Please install torch first.") import torch from torch.onnx import export print(f"Using framework PyTorch: {torch.__version__}") with torch.no_grad(): input_names, output_names, dynamic_axes, tokens = infer_shapes(nlp, "pt") ordered_input_names, model_args = ensure_valid_input(nlp.model, tokens, input_names) print('Exporting from PyTorch to ONNX...') print('input_names', input_names) print('output_names', output_names) print('dynamic_axes', dynamic_axes) print('tokens', tokens) print('model_args', model_args) export( nlp.model, model_args, f=output.as_posix(), input_names=ordered_input_names, output_names=output_names, dynamic_axes=dynamic_axes, do_constant_folding=True, use_external_data_format=use_external_format, enable_onnx_checker=True, opset_version=opset, verbose=True ) def convert_tensorflow(nlp: Pipeline, opset: int, output: Path): """ Export a TensorFlow backed pipeline to ONNX Intermediate Representation (IR) Args: nlp: The pipeline to be exported opset: The actual version of the ONNX operator set to use output: Path where the generated ONNX model will be stored Notes: TensorFlow cannot export models bigger than 2GB due to an internal constraint in TensorFlow """ if not is_tf_available(): raise Exception("Cannot convert because TF is not installed. Please install tensorflow first.") print("/!\\ Please note TensorFlow doesn't support exporting models > 2GB /!\\") try: import tensorflow as tf from keras2onnx import __version__ as k2ov from keras2onnx import convert_keras, save_model print(f"Using framework TensorFlow: {tf.version.VERSION}, keras2onnx: {k2ov}") # Build input_names, output_names, dynamic_axes, tokens = infer_shapes(nlp, "tf") # Forward nlp.model.predict(tokens.data) onnx_model = convert_keras(nlp.model, nlp.model.name, target_opset=opset) save_model(onnx_model, output.as_posix()) except ImportError as e: raise Exception(f"Cannot import {e.name} required to convert TF model to ONNX. Please install {e.name} first.") def convert( framework: str, model: str, output: Path, opset: int, tokenizer: Optional[str] = None, use_external_format: bool = False, pipeline_name: str = "feature-extraction", save_config: bool = False, **model_kwargs ): """ Convert the pipeline object to the ONNX Intermediate Representation (IR) format Args: framework: The framework the pipeline is backed by ("pt" or "tf") model: The name of the model to load for the pipeline output: The path where the ONNX graph will be stored opset: The actual version of the ONNX operator set to use tokenizer: The name of the tokenizer to load for the pipeline, defaults to the model's name if not provided use_external_format: Split the model definition from its parameters to allow models bigger than 2GB (PyTorch only) pipeline_name: The kind of pipeline to instantiate (ner, question-answering, etc.) model_kwargs: Keyword arguments to be forwarded to the model constructor Returns: """ print(f"ONNX opset version set to: {opset}") # Load the pipeline nlp = load_graph_from_args(pipeline_name, framework, model, tokenizer, **model_kwargs) if not output.parent.exists(): print(f"Creating folder {output.parent}") makedirs(output.parent.as_posix()) #elif len(listdir(output.parent.as_posix())) > 0: # raise Exception(f"Folder {output.parent.as_posix()} is not empty, aborting conversion") # Export the graph if framework == "pt": convert_pytorch(nlp, opset, output, use_external_format) else: convert_tensorflow(nlp, opset, output) # Save the configuration if save_config: config_path = os.path.splitext(output)[0] + '.json' config = dict( model = nlp.model.config.to_dict(), tokenizer = nlp.tokenizer.init_kwargs ) #nlp.model.config.to_json_file(config_path) with open(config_path, 'w') as config_file: json.dump(config, config_file, indent=2) print(f"Saved config to {config_path}") def optimize(onnx_model_path: Path) -> Path: """ Load the model at the specified path and let onnxruntime apply transformations on the graph to enable all the optimizations possible Args: onnx_model_path: filepath where the model binary description is stored Returns: Path where the optimized model binary description has been saved """ from onnxruntime import InferenceSession, SessionOptions # Generate model name with suffix "optimized" opt_model_path = generate_identified_filename(onnx_model_path, "-optimized") sess_option = SessionOptions() sess_option.optimized_model_filepath = opt_model_path.as_posix() _ = InferenceSession(onnx_model_path.as_posix(), sess_option) print(f"Optimized model has been written at {opt_model_path}: \N{heavy check mark}") print("/!\\ Optimized model contains hardware specific operators which might not be portable. /!\\") return opt_model_path def quantize(onnx_model_path: Path) -> Path: """ Quantize the weights of the model from float32 to int8 to allow very efficient inference on modern CPUs Args: onnx_model_path: Path to the location where the exported ONNX model is stored Returns: The Path generated for the quantized model """ import onnx from onnxruntime.quantization import QuantizationMode, quantize onnx_model = onnx.load(onnx_model_path.as_posix()) # Discussed with @yufenglee from ONNX runtime, this will be addressed in the next release of onnxruntime print( "As of onnxruntime 1.4.0, models larger than 2GB will fail to quantize due to protobuf constraint.\n" "This limitation will be removed in the next release of onnxruntime."
) quantized_model = quantize( model=onnx_model, quantization_mode=QuantizationMode.IntegerOps, force_fusions=True, symmetric_weight=True, ) # Append "-quantized" at the end of the model's name quantized_model_path = generate_identified_filename(onnx_model_path, "-quantized") # Save model print(f"Quantized model has been written at {quantized_model_path}: \N{heavy check mark}") onnx.save_model(quantized_model, quantized_model_path.as_posix()) return quantized_model_path def verify(path: Path): from onnxruntime import InferenceSession, SessionOptions from onnxruntime.capi.onnxruntime_pybind11_state import RuntimeException print(f"Checking ONNX model loading from: {path} ...") try: onnx_options = SessionOptions() _ = InferenceSession(path.as_posix(), onnx_options, providers=["CPUExecutionProvider"]) print(f"Model {path} correctly loaded: \N{heavy check mark}") except RuntimeException as re: print(f"Error while loading the model {re}: \N{heavy ballot x}") if __name__ == "__main__": parser = OnnxConverterArgumentParser() args = parser.parse_args() # Make sure output is absolute path args.output = Path(args.output).absolute() try: print("\n====== Converting model to ONNX ======") # Convert convert( args.framework, args.model, args.output, args.opset, args.tokenizer, args.use_external_format, args.pipeline, args.save_config ) if args.quantize: # Ensure requirements for quantization on onnxruntime is met check_onnxruntime_requirements(ORT_QUANTIZE_MINIMUM_VERSION) # onnxruntime optimizations doesn't provide the same level of performances on TensorFlow than PyTorch if args.framework == "tf": print( "\t Using TensorFlow might not provide the same optimization level compared to PyTorch.\n" "\t For TensorFlow users you can try optimizing the model directly through onnxruntime_tools.\n" "\t For more information, please refer to the onnxruntime documentation:\n" "\t\thttps://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers\n" ) print("\n====== Optimizing ONNX model ======") # Quantization works best when using the optimized version of the model args.optimized_output = optimize(args.output) # Do the quantization on the right graph args.quantized_output = quantize(args.optimized_output) # And verify if args.check_loading: print("\n====== Check exported ONNX model(s) ======") verify(args.output) if hasattr(args, "optimized_output"): verify(args.optimized_output) if hasattr(args, "quantized_output"): verify(args.quantized_output) except Exception as e: print(f"Error while converting the model: {e}") exit(1) ================================================ FILE: patches/transformers/4.5.1/modeling_distilbert.diff ================================================ 183,184c183,184 < mask = (mask == 0).view(mask_reshp).expand_as(scores) # (bs, n_heads, q_length, k_length) < scores.masked_fill_(mask, -float("inf")) # (bs, n_heads, q_length, k_length) --- > mask = mask.view(mask_reshp).expand_as(scores) # (bs, n_heads, q_length, k_length) > scores.masked_fill_((mask == 0), -float("inf")) # (bs, n_heads, q_length, k_length) ================================================ FILE: patches/transformers/4.5.1/modeling_distilbert.original.py ================================================ # coding=utf-8 # Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ PyTorch DistilBERT model adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM) and in part from HuggingFace PyTorch version of Google AI Bert model (https://github.com/google-research/bert) """ import copy import math import numpy as np import torch import torch.nn as nn from torch.nn import CrossEntropyLoss from ...activations import gelu from ...file_utils import ( add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings, ) from ...modeling_outputs import ( BaseModelOutput, MaskedLMOutput, MultipleChoiceModelOutput, QuestionAnsweringModelOutput, SequenceClassifierOutput, TokenClassifierOutput, ) from ...modeling_utils import ( PreTrainedModel, apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer, ) from ...utils import logging from .configuration_distilbert import DistilBertConfig logger = logging.get_logger(__name__) _CHECKPOINT_FOR_DOC = "distilbert-base-uncased" _CONFIG_FOR_DOC = "DistilBertConfig" _TOKENIZER_FOR_DOC = "DistilBertTokenizer" DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "distilbert-base-uncased", "distilbert-base-uncased-distilled-squad", "distilbert-base-cased", "distilbert-base-cased-distilled-squad", "distilbert-base-german-cased", "distilbert-base-multilingual-cased", "distilbert-base-uncased-finetuned-sst-2-english", # See all DistilBERT models at https://huggingface.co/models?filter=distilbert ] # UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE # def create_sinusoidal_embeddings(n_pos, dim, out): position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]) out.requires_grad = False out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2])) out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2])) out.detach_() class Embeddings(nn.Module): def __init__(self, config): super().__init__() self.word_embeddings = nn.Embedding(config.vocab_size, config.dim, padding_idx=config.pad_token_id) self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.dim) if config.sinusoidal_pos_embds: create_sinusoidal_embeddings( n_pos=config.max_position_embeddings, dim=config.dim, out=self.position_embeddings.weight ) self.LayerNorm = nn.LayerNorm(config.dim, eps=1e-12) self.dropout = nn.Dropout(config.dropout) def forward(self, input_ids): """ Parameters: input_ids: torch.tensor(bs, max_seq_length) The token ids to embed. 
Returns: torch.tensor(bs, max_seq_length, dim) The embedded tokens (plus position embeddings, no token_type embeddings) """ seq_length = input_ids.size(1) position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) # (max_seq_length) position_ids = position_ids.unsqueeze(0).expand_as(input_ids) # (bs, max_seq_length) word_embeddings = self.word_embeddings(input_ids) # (bs, max_seq_length, dim) position_embeddings = self.position_embeddings(position_ids) # (bs, max_seq_length, dim) embeddings = word_embeddings + position_embeddings # (bs, max_seq_length, dim) embeddings = self.LayerNorm(embeddings) # (bs, max_seq_length, dim) embeddings = self.dropout(embeddings) # (bs, max_seq_length, dim) return embeddings class MultiHeadSelfAttention(nn.Module): def __init__(self, config): super().__init__() self.n_heads = config.n_heads self.dim = config.dim self.dropout = nn.Dropout(p=config.attention_dropout) assert self.dim % self.n_heads == 0 self.q_lin = nn.Linear(in_features=config.dim, out_features=config.dim) self.k_lin = nn.Linear(in_features=config.dim, out_features=config.dim) self.v_lin = nn.Linear(in_features=config.dim, out_features=config.dim) self.out_lin = nn.Linear(in_features=config.dim, out_features=config.dim) self.pruned_heads = set() def prune_heads(self, heads): attention_head_size = self.dim // self.n_heads if len(heads) == 0: return heads, index = find_pruneable_heads_and_indices(heads, self.n_heads, attention_head_size, self.pruned_heads) # Prune linear layers self.q_lin = prune_linear_layer(self.q_lin, index) self.k_lin = prune_linear_layer(self.k_lin, index) self.v_lin = prune_linear_layer(self.v_lin, index) self.out_lin = prune_linear_layer(self.out_lin, index, dim=1) # Update hyper params self.n_heads = self.n_heads - len(heads) self.dim = attention_head_size * self.n_heads self.pruned_heads = self.pruned_heads.union(heads) def forward(self, query, key, value, mask, head_mask=None, output_attentions=False): """ Parameters: query: torch.tensor(bs, seq_length, dim) key: torch.tensor(bs, seq_length, dim) value: torch.tensor(bs, seq_length, dim) mask: torch.tensor(bs, seq_length) Returns: weights: torch.tensor(bs, n_heads, seq_length, seq_length) Attention weights context: torch.tensor(bs, seq_length, dim) Contextualized layer. 
Optional: only if `output_attentions=True` """ bs, q_length, dim = query.size() k_length = key.size(1) # assert dim == self.dim, f'Dimensions do not match: {dim} input vs {self.dim} configured' # assert key.size() == value.size() dim_per_head = self.dim // self.n_heads mask_reshp = (bs, 1, 1, k_length) def shape(x): """ separate heads """ return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2) def unshape(x): """ group heads """ return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head) q = shape(self.q_lin(query)) # (bs, n_heads, q_length, dim_per_head) k = shape(self.k_lin(key)) # (bs, n_heads, k_length, dim_per_head) v = shape(self.v_lin(value)) # (bs, n_heads, k_length, dim_per_head) q = q / math.sqrt(dim_per_head) # (bs, n_heads, q_length, dim_per_head) scores = torch.matmul(q, k.transpose(2, 3)) # (bs, n_heads, q_length, k_length) mask = (mask == 0).view(mask_reshp).expand_as(scores) # (bs, n_heads, q_length, k_length) scores.masked_fill_(mask, -float("inf")) # (bs, n_heads, q_length, k_length) weights = nn.Softmax(dim=-1)(scores) # (bs, n_heads, q_length, k_length) weights = self.dropout(weights) # (bs, n_heads, q_length, k_length) # Mask heads if we want to if head_mask is not None: weights = weights * head_mask context = torch.matmul(weights, v) # (bs, n_heads, q_length, dim_per_head) context = unshape(context) # (bs, q_length, dim) context = self.out_lin(context) # (bs, q_length, dim) if output_attentions: return (context, weights) else: return (context,) class FFN(nn.Module): def __init__(self, config): super().__init__() self.dropout = nn.Dropout(p=config.dropout) self.chunk_size_feed_forward = config.chunk_size_feed_forward self.seq_len_dim = 1 self.lin1 = nn.Linear(in_features=config.dim, out_features=config.hidden_dim) self.lin2 = nn.Linear(in_features=config.hidden_dim, out_features=config.dim) assert config.activation in ["relu", "gelu"], f"activation ({config.activation}) must be in ['relu', 'gelu']" self.activation = gelu if config.activation == "gelu" else nn.ReLU() def forward(self, input): return apply_chunking_to_forward(self.ff_chunk, self.chunk_size_feed_forward, self.seq_len_dim, input) def ff_chunk(self, input): x = self.lin1(input) x = self.activation(x) x = self.lin2(x) x = self.dropout(x) return x class TransformerBlock(nn.Module): def __init__(self, config): super().__init__() assert config.dim % config.n_heads == 0 self.attention = MultiHeadSelfAttention(config) self.sa_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12) self.ffn = FFN(config) self.output_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12) def forward(self, x, attn_mask=None, head_mask=None, output_attentions=False): """ Parameters: x: torch.tensor(bs, seq_length, dim) attn_mask: torch.tensor(bs, seq_length) Returns: sa_weights: torch.tensor(bs, n_heads, seq_length, seq_length) The attention weights ffn_output: torch.tensor(bs, seq_length, dim) The output of the transformer block contextualization. 
""" # Self-Attention sa_output = self.attention( query=x, key=x, value=x, mask=attn_mask, head_mask=head_mask, output_attentions=output_attentions, ) if output_attentions: sa_output, sa_weights = sa_output # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length) else: # To handle these `output_attentions` or `output_hidden_states` cases returning tuples assert type(sa_output) == tuple sa_output = sa_output[0] sa_output = self.sa_layer_norm(sa_output + x) # (bs, seq_length, dim) # Feed Forward Network ffn_output = self.ffn(sa_output) # (bs, seq_length, dim) ffn_output = self.output_layer_norm(ffn_output + sa_output) # (bs, seq_length, dim) output = (ffn_output,) if output_attentions: output = (sa_weights,) + output return output class Transformer(nn.Module): def __init__(self, config): super().__init__() self.n_layers = config.n_layers layer = TransformerBlock(config) self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.n_layers)]) def forward( self, x, attn_mask=None, head_mask=None, output_attentions=False, output_hidden_states=False, return_dict=None ): # docstyle-ignore """ Parameters: x: torch.tensor(bs, seq_length, dim) Input sequence embedded. attn_mask: torch.tensor(bs, seq_length) Attention mask on the sequence. Returns: hidden_state: torch.tensor(bs, seq_length, dim) Sequence of hidden states in the last (top) layer all_hidden_states: Tuple[torch.tensor(bs, seq_length, dim)] Tuple of length n_layers with the hidden states from each layer. Optional: only if output_hidden_states=True all_attentions: Tuple[torch.tensor(bs, n_heads, seq_length, seq_length)] Tuple of length n_layers with the attention weights from each layer Optional: only if output_attentions=True """ all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None hidden_state = x for i, layer_module in enumerate(self.layer): if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_state,) layer_outputs = layer_module( x=hidden_state, attn_mask=attn_mask, head_mask=head_mask[i], output_attentions=output_attentions ) hidden_state = layer_outputs[-1] if output_attentions: assert len(layer_outputs) == 2 attentions = layer_outputs[0] all_attentions = all_attentions + (attentions,) else: assert len(layer_outputs) == 1 # Add last layer if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_state,) if not return_dict: return tuple(v for v in [hidden_state, all_hidden_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_state, hidden_states=all_hidden_states, attentions=all_attentions ) # INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL # class DistilBertPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. 
""" config_class = DistilBertConfig load_tf_weights = None base_model_prefix = "distilbert" def _init_weights(self, module): """Initialize the weights.""" if isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if module.bias is not None: module.bias.data.zero_() elif isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) DISTILBERT_START_DOCSTRING = r""" This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) This model is also a PyTorch `torch.nn.Module `__ subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers.DistilBertConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ DISTILBERT_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`~transformers.DistilBertTokenizer`. See :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. `What are attention masks? <../glossary.html#attention-mask>`__ head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: - 1 indicates the head is **not masked**, - 0 indicates the head is **masked**. inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert :obj:`input_ids` indices into associated vectors than the model's internal embedding lookup matrix. output_attentions (:obj:`bool`, `optional`): Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned tensors for more detail. output_hidden_states (:obj:`bool`, `optional`): Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for more detail. return_dict (:obj:`bool`, `optional`): Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. 
""" @add_start_docstrings( "The bare DistilBERT encoder/transformer outputting raw hidden-states without any specific head on top.", DISTILBERT_START_DOCSTRING, ) class DistilBertModel(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.embeddings = Embeddings(config) # Embeddings self.transformer = Transformer(config) # Encoder self.init_weights() def get_input_embeddings(self): return self.embeddings.word_embeddings def set_input_embeddings(self, new_embeddings): self.embeddings.word_embeddings = new_embeddings def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ for layer, heads in heads_to_prune.items(): self.transformer.layer[layer].attention.prune_heads(heads) @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC, ) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = input_ids.size() elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") device = input_ids.device if input_ids is not None else inputs_embeds.device if attention_mask is None: attention_mask = torch.ones(input_shape, device=device) # (bs, seq_length) # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) if inputs_embeds is None: inputs_embeds = self.embeddings(input_ids) # (bs, seq_length, dim) return self.transformer( x=inputs_embeds, attn_mask=attention_mask, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) @add_start_docstrings( """DistilBert Model with a `masked language modeling` head on top. 
""", DISTILBERT_START_DOCSTRING, ) class DistilBertForMaskedLM(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.distilbert = DistilBertModel(config) self.vocab_transform = nn.Linear(config.dim, config.dim) self.vocab_layer_norm = nn.LayerNorm(config.dim, eps=1e-12) self.vocab_projector = nn.Linear(config.dim, config.vocab_size) self.init_weights() self.mlm_loss_fct = nn.CrossEntropyLoss() def get_output_embeddings(self): return self.vocab_projector def set_output_embeddings(self, new_embeddings): self.vocab_projector = new_embeddings @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC, ) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``. """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict dlbrt_output = self.distilbert( input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_states = dlbrt_output[0] # (bs, seq_length, dim) prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim) prediction_logits = gelu(prediction_logits) # (bs, seq_length, dim) prediction_logits = self.vocab_layer_norm(prediction_logits) # (bs, seq_length, dim) prediction_logits = self.vocab_projector(prediction_logits) # (bs, seq_length, vocab_size) mlm_loss = None if labels is not None: mlm_loss = self.mlm_loss_fct(prediction_logits.view(-1, prediction_logits.size(-1)), labels.view(-1)) if not return_dict: output = (prediction_logits,) + dlbrt_output[1:] return ((mlm_loss,) + output) if mlm_loss is not None else output return MaskedLMOutput( loss=mlm_loss, logits=prediction_logits, hidden_states=dlbrt_output.hidden_states, attentions=dlbrt_output.attentions, ) @add_start_docstrings( """ DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. 
""", DISTILBERT_START_DOCSTRING, ) class DistilBertForSequenceClassification(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.distilbert = DistilBertModel(config) self.pre_classifier = nn.Linear(config.dim, config.dim) self.classifier = nn.Linear(config.dim, config.num_labels) self.dropout = nn.Dropout(config.seq_classif_dropout) self.init_weights() @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, ) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict distilbert_output = self.distilbert( input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_state = distilbert_output[0] # (bs, seq_len, dim) pooled_output = hidden_state[:, 0] # (bs, dim) pooled_output = self.pre_classifier(pooled_output) # (bs, dim) pooled_output = nn.ReLU()(pooled_output) # (bs, dim) pooled_output = self.dropout(pooled_output) # (bs, dim) logits = self.classifier(pooled_output) # (bs, num_labels) loss = None if labels is not None: if self.num_labels == 1: loss_fct = nn.MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = nn.CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) if not return_dict: output = (logits,) + distilbert_output[1:] return ((loss,) + output) if loss is not None else output return SequenceClassifierOutput( loss=loss, logits=logits, hidden_states=distilbert_output.hidden_states, attentions=distilbert_output.attentions, ) @add_start_docstrings( """ DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
""", DISTILBERT_START_DOCSTRING, ) class DistilBertForQuestionAnswering(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.distilbert = DistilBertModel(config) self.qa_outputs = nn.Linear(config.dim, config.num_labels) assert config.num_labels == 2 self.dropout = nn.Dropout(config.qa_dropout) self.init_weights() @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=QuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC, ) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict distilbert_output = self.distilbert( input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_states = distilbert_output[0] # (bs, max_query_len, dim) hidden_states = self.dropout(hidden_states) # (bs, max_query_len, dim) logits = self.qa_outputs(hidden_states) # (bs, max_query_len, 2) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1) # (bs, max_query_len) end_logits = end_logits.squeeze(-1) # (bs, max_query_len) total_loss = None if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: start_positions = start_positions.squeeze(-1) if len(end_positions.size()) > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms ignored_index = start_logits.size(1) start_positions.clamp_(0, ignored_index) end_positions.clamp_(0, ignored_index) loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index) start_loss = loss_fct(start_logits, start_positions) end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 if not return_dict: output = (start_logits, end_logits) + distilbert_output[1:] return ((total_loss,) + output) if total_loss is not None else output return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, end_logits=end_logits, hidden_states=distilbert_output.hidden_states, attentions=distilbert_output.attentions, ) @add_start_docstrings( """ DistilBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 
""", DISTILBERT_START_DOCSTRING, ) class DistilBertForTokenClassification(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.distilbert = DistilBertModel(config) self.dropout = nn.Dropout(config.dropout) self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC, ) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.distilbert( input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) loss = None if labels is not None: loss_fct = CrossEntropyLoss() # Only keep active parts of the loss if attention_mask is not None: active_loss = attention_mask.view(-1) == 1 active_logits = logits.view(-1, self.num_labels) active_labels = torch.where( active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) ) loss = loss_fct(active_logits, active_labels) else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) if not return_dict: output = (logits,) + outputs[1:] return ((loss,) + output) if loss is not None else output return TokenClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) @add_start_docstrings( """ DistilBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, DISTILBERT_START_DOCSTRING, ) class DistilBertForMultipleChoice(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.distilbert = DistilBertModel(config) self.pre_classifier = nn.Linear(config.dim, config.dim) self.classifier = nn.Linear(config.dim, 1) self.dropout = nn.Dropout(config.seq_classif_dropout) self.init_weights() @add_start_docstrings_to_model_forward( DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") ) @replace_return_docstrings(output_type=MultipleChoiceModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. 
(See :obj:`input_ids` above) Returns: Examples:: >>> from transformers import DistilBertTokenizer, DistilBertForMultipleChoice >>> import torch >>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') >>> model = DistilBertForMultipleChoice.from_pretrained('distilbert-base-cased') >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." >>> choice0 = "It is eaten with a fork and a knife." >>> choice1 = "It is eaten while held in the hand." >>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1 >>> encoding = tokenizer([[prompt, choice0], [prompt, choice1]], return_tensors='pt', padding=True) >>> outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1 >>> # the linear classifier still needs to be trained >>> loss = outputs.loss >>> logits = outputs.logits """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None inputs_embeds = ( inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) if inputs_embeds is not None else None ) outputs = self.distilbert( input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_state = outputs[0] # (bs * num_choices, seq_len, dim) pooled_output = hidden_state[:, 0] # (bs * num_choices, dim) pooled_output = self.pre_classifier(pooled_output) # (bs * num_choices, dim) pooled_output = nn.ReLU()(pooled_output) # (bs * num_choices, dim) pooled_output = self.dropout(pooled_output) # (bs * num_choices, dim) logits = self.classifier(pooled_output) # (bs * num_choices, 1) reshaped_logits = logits.view(-1, num_choices) # (bs, num_choices) loss = None if labels is not None: loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) if not return_dict: output = (reshaped_logits,) + outputs[1:] return ((loss,) + output) if loss is not None else output return MultipleChoiceModelOutput( loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) ================================================ FILE: patches/transformers/4.5.1/modeling_distilbert.py ================================================ # coding=utf-8 # Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
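# NOTE: relative to modeling_distilbert.original.py, the only change in this
# patched copy is the ordering of the attention-mask computation in
# MultiHeadSelfAttention.forward (see modeling_distilbert.diff): the boolean
# comparison (mask == 0) is moved from before the view()/expand_as() to the
# masked_fill_() call site, presumably so the exported ONNX graph expands the
# numeric mask rather than a bool tensor. Both orderings are numerically
# identical; the sketch below (illustrative only, with made-up shapes, never
# called by the model) demonstrates the equivalence:
def _mask_order_equivalence_sketch():
    import torch
    bs, n_heads, q_length, k_length = 2, 4, 8, 8
    scores = torch.randn(bs, n_heads, q_length, k_length)
    mask = torch.randint(0, 2, (bs, k_length))  # (bs, k_length), 1 = attend, 0 = padding
    mask_reshp = (bs, 1, 1, k_length)
    # original ordering: boolean-ize first, then reshape/expand
    m1 = (mask == 0).view(mask_reshp).expand_as(scores)
    s1 = scores.clone().masked_fill_(m1, -float("inf"))
    # patched ordering: reshape/expand the raw mask, compare at the fill site
    m2 = mask.view(mask_reshp).expand_as(scores)
    s2 = scores.clone().masked_fill_((m2 == 0), -float("inf"))
    assert torch.equal(s1, s2)  # same scores, different traced op order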
""" PyTorch DistilBERT model adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM) and in part from HuggingFace PyTorch version of Google AI Bert model (https://github.com/google-research/bert) """ import copy import math import numpy as np import torch import torch.nn as nn from torch.nn import CrossEntropyLoss from ...activations import gelu from ...file_utils import ( add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings, ) from ...modeling_outputs import ( BaseModelOutput, MaskedLMOutput, MultipleChoiceModelOutput, QuestionAnsweringModelOutput, SequenceClassifierOutput, TokenClassifierOutput, ) from ...modeling_utils import ( PreTrainedModel, apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer, ) from ...utils import logging from .configuration_distilbert import DistilBertConfig logger = logging.get_logger(__name__) _CHECKPOINT_FOR_DOC = "distilbert-base-uncased" _CONFIG_FOR_DOC = "DistilBertConfig" _TOKENIZER_FOR_DOC = "DistilBertTokenizer" DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "distilbert-base-uncased", "distilbert-base-uncased-distilled-squad", "distilbert-base-cased", "distilbert-base-cased-distilled-squad", "distilbert-base-german-cased", "distilbert-base-multilingual-cased", "distilbert-base-uncased-finetuned-sst-2-english", # See all DistilBERT models at https://huggingface.co/models?filter=distilbert ] # UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE # def create_sinusoidal_embeddings(n_pos, dim, out): position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]) out.requires_grad = False out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2])) out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2])) out.detach_() class Embeddings(nn.Module): def __init__(self, config): super().__init__() self.word_embeddings = nn.Embedding(config.vocab_size, config.dim, padding_idx=config.pad_token_id) self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.dim) if config.sinusoidal_pos_embds: create_sinusoidal_embeddings( n_pos=config.max_position_embeddings, dim=config.dim, out=self.position_embeddings.weight ) self.LayerNorm = nn.LayerNorm(config.dim, eps=1e-12) self.dropout = nn.Dropout(config.dropout) def forward(self, input_ids): """ Parameters: input_ids: torch.tensor(bs, max_seq_length) The token ids to embed. 
Returns: torch.tensor(bs, max_seq_length, dim) The embedded tokens (plus position embeddings, no token_type embeddings) """ seq_length = input_ids.size(1) position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) # (max_seq_length) position_ids = position_ids.unsqueeze(0).expand_as(input_ids) # (bs, max_seq_length) word_embeddings = self.word_embeddings(input_ids) # (bs, max_seq_length, dim) position_embeddings = self.position_embeddings(position_ids) # (bs, max_seq_length, dim) embeddings = word_embeddings + position_embeddings # (bs, max_seq_length, dim) embeddings = self.LayerNorm(embeddings) # (bs, max_seq_length, dim) embeddings = self.dropout(embeddings) # (bs, max_seq_length, dim) return embeddings class MultiHeadSelfAttention(nn.Module): def __init__(self, config): super().__init__() self.n_heads = config.n_heads self.dim = config.dim self.dropout = nn.Dropout(p=config.attention_dropout) assert self.dim % self.n_heads == 0 self.q_lin = nn.Linear(in_features=config.dim, out_features=config.dim) self.k_lin = nn.Linear(in_features=config.dim, out_features=config.dim) self.v_lin = nn.Linear(in_features=config.dim, out_features=config.dim) self.out_lin = nn.Linear(in_features=config.dim, out_features=config.dim) self.pruned_heads = set() def prune_heads(self, heads): attention_head_size = self.dim // self.n_heads if len(heads) == 0: return heads, index = find_pruneable_heads_and_indices(heads, self.n_heads, attention_head_size, self.pruned_heads) # Prune linear layers self.q_lin = prune_linear_layer(self.q_lin, index) self.k_lin = prune_linear_layer(self.k_lin, index) self.v_lin = prune_linear_layer(self.v_lin, index) self.out_lin = prune_linear_layer(self.out_lin, index, dim=1) # Update hyper params self.n_heads = self.n_heads - len(heads) self.dim = attention_head_size * self.n_heads self.pruned_heads = self.pruned_heads.union(heads) def forward(self, query, key, value, mask, head_mask=None, output_attentions=False): """ Parameters: query: torch.tensor(bs, seq_length, dim) key: torch.tensor(bs, seq_length, dim) value: torch.tensor(bs, seq_length, dim) mask: torch.tensor(bs, seq_length) Returns: weights: torch.tensor(bs, n_heads, seq_length, seq_length) Attention weights context: torch.tensor(bs, seq_length, dim) Contextualized layer. 
Optional: only if `output_attentions=True` """ bs, q_length, dim = query.size() k_length = key.size(1) # assert dim == self.dim, f'Dimensions do not match: {dim} input vs {self.dim} configured' # assert key.size() == value.size() dim_per_head = self.dim // self.n_heads mask_reshp = (bs, 1, 1, k_length) def shape(x): """ separate heads """ return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2) def unshape(x): """ group heads """ return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head) q = shape(self.q_lin(query)) # (bs, n_heads, q_length, dim_per_head) k = shape(self.k_lin(key)) # (bs, n_heads, k_length, dim_per_head) v = shape(self.v_lin(value)) # (bs, n_heads, k_length, dim_per_head) q = q / math.sqrt(dim_per_head) # (bs, n_heads, q_length, dim_per_head) scores = torch.matmul(q, k.transpose(2, 3)) # (bs, n_heads, q_length, k_length) mask = mask.view(mask_reshp).expand_as(scores) # (bs, n_heads, q_length, k_length) scores.masked_fill_((mask == 0), -float("inf")) # (bs, n_heads, q_length, k_length) weights = nn.Softmax(dim=-1)(scores) # (bs, n_heads, q_length, k_length) weights = self.dropout(weights) # (bs, n_heads, q_length, k_length) # Mask heads if we want to if head_mask is not None: weights = weights * head_mask context = torch.matmul(weights, v) # (bs, n_heads, q_length, dim_per_head) context = unshape(context) # (bs, q_length, dim) context = self.out_lin(context) # (bs, q_length, dim) if output_attentions: return (context, weights) else: return (context,) class FFN(nn.Module): def __init__(self, config): super().__init__() self.dropout = nn.Dropout(p=config.dropout) self.chunk_size_feed_forward = config.chunk_size_feed_forward self.seq_len_dim = 1 self.lin1 = nn.Linear(in_features=config.dim, out_features=config.hidden_dim) self.lin2 = nn.Linear(in_features=config.hidden_dim, out_features=config.dim) assert config.activation in ["relu", "gelu"], f"activation ({config.activation}) must be in ['relu', 'gelu']" self.activation = gelu if config.activation == "gelu" else nn.ReLU() def forward(self, input): return apply_chunking_to_forward(self.ff_chunk, self.chunk_size_feed_forward, self.seq_len_dim, input) def ff_chunk(self, input): x = self.lin1(input) x = self.activation(x) x = self.lin2(x) x = self.dropout(x) return x class TransformerBlock(nn.Module): def __init__(self, config): super().__init__() assert config.dim % config.n_heads == 0 self.attention = MultiHeadSelfAttention(config) self.sa_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12) self.ffn = FFN(config) self.output_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12) def forward(self, x, attn_mask=None, head_mask=None, output_attentions=False): """ Parameters: x: torch.tensor(bs, seq_length, dim) attn_mask: torch.tensor(bs, seq_length) Returns: sa_weights: torch.tensor(bs, n_heads, seq_length, seq_length) The attention weights ffn_output: torch.tensor(bs, seq_length, dim) The output of the transformer block contextualization. 
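Example (an illustrative sketch with made-up shapes; in practice the block is driven by Transformer/DistilBertModel rather than called directly)::

    >>> import torch
    >>> from transformers import DistilBertConfig
    >>> config = DistilBertConfig()            # defaults: dim=768, n_heads=12
    >>> block = TransformerBlock(config)
    >>> x = torch.randn(1, 8, config.dim)      # (bs, seq_length, dim)
    >>> attn_mask = torch.ones(1, 8)           # (bs, seq_length), 1 = attend, 0 = padding
    >>> (hidden_state,) = block(x, attn_mask=attn_mask)
    >>> hidden_state.shape
    torch.Size([1, 8, 768])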
""" # Self-Attention sa_output = self.attention( query=x, key=x, value=x, mask=attn_mask, head_mask=head_mask, output_attentions=output_attentions, ) if output_attentions: sa_output, sa_weights = sa_output # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length) else: # To handle these `output_attentions` or `output_hidden_states` cases returning tuples assert type(sa_output) == tuple sa_output = sa_output[0] sa_output = self.sa_layer_norm(sa_output + x) # (bs, seq_length, dim) # Feed Forward Network ffn_output = self.ffn(sa_output) # (bs, seq_length, dim) ffn_output = self.output_layer_norm(ffn_output + sa_output) # (bs, seq_length, dim) output = (ffn_output,) if output_attentions: output = (sa_weights,) + output return output class Transformer(nn.Module): def __init__(self, config): super().__init__() self.n_layers = config.n_layers layer = TransformerBlock(config) self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.n_layers)]) def forward( self, x, attn_mask=None, head_mask=None, output_attentions=False, output_hidden_states=False, return_dict=None ): # docstyle-ignore """ Parameters: x: torch.tensor(bs, seq_length, dim) Input sequence embedded. attn_mask: torch.tensor(bs, seq_length) Attention mask on the sequence. Returns: hidden_state: torch.tensor(bs, seq_length, dim) Sequence of hidden states in the last (top) layer all_hidden_states: Tuple[torch.tensor(bs, seq_length, dim)] Tuple of length n_layers with the hidden states from each layer. Optional: only if output_hidden_states=True all_attentions: Tuple[torch.tensor(bs, n_heads, seq_length, seq_length)] Tuple of length n_layers with the attention weights from each layer Optional: only if output_attentions=True """ all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None hidden_state = x for i, layer_module in enumerate(self.layer): if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_state,) layer_outputs = layer_module( x=hidden_state, attn_mask=attn_mask, head_mask=head_mask[i], output_attentions=output_attentions ) hidden_state = layer_outputs[-1] if output_attentions: assert len(layer_outputs) == 2 attentions = layer_outputs[0] all_attentions = all_attentions + (attentions,) else: assert len(layer_outputs) == 1 # Add last layer if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_state,) if not return_dict: return tuple(v for v in [hidden_state, all_hidden_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_state, hidden_states=all_hidden_states, attentions=all_attentions ) # INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL # class DistilBertPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. 
""" config_class = DistilBertConfig load_tf_weights = None base_model_prefix = "distilbert" def _init_weights(self, module): """Initialize the weights.""" if isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if module.bias is not None: module.bias.data.zero_() elif isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) DISTILBERT_START_DOCSTRING = r""" This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) This model is also a PyTorch `torch.nn.Module `__ subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config (:class:`~transformers.DistilBertConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ DISTILBERT_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`~transformers.DistilBertTokenizer`. See :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. `What are attention masks? <../glossary.html#attention-mask>`__ head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: - 1 indicates the head is **not masked**, - 0 indicates the head is **masked**. inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert :obj:`input_ids` indices into associated vectors than the model's internal embedding lookup matrix. output_attentions (:obj:`bool`, `optional`): Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned tensors for more detail. output_hidden_states (:obj:`bool`, `optional`): Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for more detail. return_dict (:obj:`bool`, `optional`): Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. 
""" @add_start_docstrings( "The bare DistilBERT encoder/transformer outputting raw hidden-states without any specific head on top.", DISTILBERT_START_DOCSTRING, ) class DistilBertModel(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.embeddings = Embeddings(config) # Embeddings self.transformer = Transformer(config) # Encoder self.init_weights() def get_input_embeddings(self): return self.embeddings.word_embeddings def set_input_embeddings(self, new_embeddings): self.embeddings.word_embeddings = new_embeddings def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ for layer, heads in heads_to_prune.items(): self.transformer.layer[layer].attention.prune_heads(heads) @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC, ) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = input_ids.size() elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") device = input_ids.device if input_ids is not None else inputs_embeds.device if attention_mask is None: attention_mask = torch.ones(input_shape, device=device) # (bs, seq_length) # Prepare head mask if needed head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) if inputs_embeds is None: inputs_embeds = self.embeddings(input_ids) # (bs, seq_length, dim) return self.transformer( x=inputs_embeds, attn_mask=attention_mask, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) @add_start_docstrings( """DistilBert Model with a `masked language modeling` head on top. 
""", DISTILBERT_START_DOCSTRING, ) class DistilBertForMaskedLM(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.distilbert = DistilBertModel(config) self.vocab_transform = nn.Linear(config.dim, config.dim) self.vocab_layer_norm = nn.LayerNorm(config.dim, eps=1e-12) self.vocab_projector = nn.Linear(config.dim, config.vocab_size) self.init_weights() self.mlm_loss_fct = nn.CrossEntropyLoss() def get_output_embeddings(self): return self.vocab_projector def set_output_embeddings(self, new_embeddings): self.vocab_projector = new_embeddings @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC, ) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``. """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict dlbrt_output = self.distilbert( input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_states = dlbrt_output[0] # (bs, seq_length, dim) prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim) prediction_logits = gelu(prediction_logits) # (bs, seq_length, dim) prediction_logits = self.vocab_layer_norm(prediction_logits) # (bs, seq_length, dim) prediction_logits = self.vocab_projector(prediction_logits) # (bs, seq_length, vocab_size) mlm_loss = None if labels is not None: mlm_loss = self.mlm_loss_fct(prediction_logits.view(-1, prediction_logits.size(-1)), labels.view(-1)) if not return_dict: output = (prediction_logits,) + dlbrt_output[1:] return ((mlm_loss,) + output) if mlm_loss is not None else output return MaskedLMOutput( loss=mlm_loss, logits=prediction_logits, hidden_states=dlbrt_output.hidden_states, attentions=dlbrt_output.attentions, ) @add_start_docstrings( """ DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. 
""", DISTILBERT_START_DOCSTRING, ) class DistilBertForSequenceClassification(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.distilbert = DistilBertModel(config) self.pre_classifier = nn.Linear(config.dim, config.dim) self.classifier = nn.Linear(config.dim, config.num_labels) self.dropout = nn.Dropout(config.seq_classif_dropout) self.init_weights() @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, ) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict distilbert_output = self.distilbert( input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_state = distilbert_output[0] # (bs, seq_len, dim) pooled_output = hidden_state[:, 0] # (bs, dim) pooled_output = self.pre_classifier(pooled_output) # (bs, dim) pooled_output = nn.ReLU()(pooled_output) # (bs, dim) pooled_output = self.dropout(pooled_output) # (bs, dim) logits = self.classifier(pooled_output) # (bs, num_labels) loss = None if labels is not None: if self.num_labels == 1: loss_fct = nn.MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = nn.CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) if not return_dict: output = (logits,) + distilbert_output[1:] return ((loss,) + output) if loss is not None else output return SequenceClassifierOutput( loss=loss, logits=logits, hidden_states=distilbert_output.hidden_states, attentions=distilbert_output.attentions, ) @add_start_docstrings( """ DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
""", DISTILBERT_START_DOCSTRING, ) class DistilBertForQuestionAnswering(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.distilbert = DistilBertModel(config) self.qa_outputs = nn.Linear(config.dim, config.num_labels) assert config.num_labels == 2 self.dropout = nn.Dropout(config.qa_dropout) self.init_weights() @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=QuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC, ) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict distilbert_output = self.distilbert( input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_states = distilbert_output[0] # (bs, max_query_len, dim) hidden_states = self.dropout(hidden_states) # (bs, max_query_len, dim) logits = self.qa_outputs(hidden_states) # (bs, max_query_len, 2) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1) # (bs, max_query_len) end_logits = end_logits.squeeze(-1) # (bs, max_query_len) total_loss = None if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: start_positions = start_positions.squeeze(-1) if len(end_positions.size()) > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms ignored_index = start_logits.size(1) start_positions.clamp_(0, ignored_index) end_positions.clamp_(0, ignored_index) loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index) start_loss = loss_fct(start_logits, start_positions) end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 if not return_dict: output = (start_logits, end_logits) + distilbert_output[1:] return ((total_loss,) + output) if total_loss is not None else output return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, end_logits=end_logits, hidden_states=distilbert_output.hidden_states, attentions=distilbert_output.attentions, ) @add_start_docstrings( """ DistilBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 
""", DISTILBERT_START_DOCSTRING, ) class DistilBertForTokenClassification(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.distilbert = DistilBertModel(config) self.dropout = nn.Dropout(config.dropout) self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.init_weights() @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC, ) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.distilbert( input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) loss = None if labels is not None: loss_fct = CrossEntropyLoss() # Only keep active parts of the loss if attention_mask is not None: active_loss = attention_mask.view(-1) == 1 active_logits = logits.view(-1, self.num_labels) active_labels = torch.where( active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) ) loss = loss_fct(active_logits, active_labels) else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) if not return_dict: output = (logits,) + outputs[1:] return ((loss,) + output) if loss is not None else output return TokenClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) @add_start_docstrings( """ DistilBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, DISTILBERT_START_DOCSTRING, ) class DistilBertForMultipleChoice(DistilBertPreTrainedModel): def __init__(self, config): super().__init__(config) self.distilbert = DistilBertModel(config) self.pre_classifier = nn.Linear(config.dim, config.dim) self.classifier = nn.Linear(config.dim, 1) self.dropout = nn.Dropout(config.seq_classif_dropout) self.init_weights() @add_start_docstrings_to_model_forward( DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") ) @replace_return_docstrings(output_type=MultipleChoiceModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. 
(See :obj:`input_ids` above) Returns: Examples:: >>> from transformers import DistilBertTokenizer, DistilBertForMultipleChoice >>> import torch >>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') >>> model = DistilBertForMultipleChoice.from_pretrained('distilbert-base-cased') >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." >>> choice0 = "It is eaten with a fork and a knife." >>> choice1 = "It is eaten while held in the hand." >>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1 >>> encoding = tokenizer([[prompt, choice0], [prompt, choice1]], return_tensors='pt', padding=True) >>> outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1 >>> # the linear classifier still needs to be trained >>> loss = outputs.loss >>> logits = outputs.logits """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None inputs_embeds = ( inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) if inputs_embeds is not None else None ) outputs = self.distilbert( input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_state = outputs[0] # (bs * num_choices, seq_len, dim) pooled_output = hidden_state[:, 0] # (bs * num_choices, dim) pooled_output = self.pre_classifier(pooled_output) # (bs * num_choices, dim) pooled_output = nn.ReLU()(pooled_output) # (bs * num_choices, dim) pooled_output = self.dropout(pooled_output) # (bs * num_choices, dim) logits = self.classifier(pooled_output) # (bs * num_choices, 1) reshaped_logits = logits.view(-1, num_choices) # (bs, num_choices) loss = None if labels is not None: loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) if not return_dict: output = (reshaped_logits,) + outputs[1:] return ((loss,) + output) if loss is not None else output return MultipleChoiceModelOutput( loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) ================================================ FILE: ros/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 3.5) project(jetson_voice_ros) # Default to C99 if(NOT CMAKE_C_STANDARD) set(CMAKE_C_STANDARD 99) endif() # Default to C++14 if(NOT CMAKE_CXX_STANDARD) set(CMAKE_CXX_STANDARD 14) endif() if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") add_compile_options(-Wall -Wextra -Wpedantic) endif() # find dependencies find_package(ament_cmake REQUIRED) find_package(ament_cmake_python REQUIRED) find_package(std_msgs REQUIRED) find_package(rosidl_default_generators REQUIRED) # generate messages rosidl_generate_interfaces(${PROJECT_NAME} "msg/Audio.msg" "msg/AudioInfo.msg" "msg/IntentSlot.msg" "msg/QuestionAnswerQuery.msg" "msg/QuestionAnswerResult.msg" "msg/Slot.msg" DEPENDENCIES std_msgs ) # install Python modules ament_python_install_package(${PROJECT_NAME}) # install Python executables file(GLOB python_nodes ${PROJECT_NAME}/*.py) install(PROGRAMS ${python_nodes} DESTINATION 
lib/${PROJECT_NAME} ) # install launch files install(DIRECTORY launch DESTINATION share/${PROJECT_NAME}/ ) if(BUILD_TESTING) find_package(ament_lint_auto REQUIRED) # the following line skips the linter which checks for copyrights # uncomment the line when a copyright and license is not present in all source files #set(ament_cmake_copyright_FOUND TRUE) # the following line skips cpplint (only works in a git repo) # uncomment the line when this package is not in a git repo #set(ament_cmake_cpplint_FOUND TRUE) ament_lint_auto_find_test_dependencies() endif() ament_package() ================================================ FILE: ros/jetson_voice_ros/__init__.py ================================================ ================================================ FILE: ros/jetson_voice_ros/asr.py ================================================ #!/usr/bin/env python3 import os import rclpy import numpy as np from rclpy.node import Node from std_msgs.msg import String from jetson_voice import ASR from jetson_voice_ros.msg import Audio class ASRNode(Node): def __init__(self): super().__init__('asr', namespace='voice') # create topics self.audio_subscriber = self.create_subscription(Audio, 'audio_in', self.audio_listener, 10) self.transcript_publisher = self.create_publisher(String, 'transcripts', 10) self.partial_transcript_publisher = self.create_publisher(String, 'partial_transcripts', 10) # get node parameters self.declare_parameter('model', 'quartznet') self.model_name = str(self.get_parameter('model').value) self.get_logger().info(f'model = {self.model_name}') # load the ASR model self.asr = ASR(self.model_name) self.get_logger().info(f"model '{self.model_name}' ready") if self.asr.classification: raise ValueError(f'jetson_voice_ros/asr node does not support ASR classification models') def audio_listener(self, msg): if msg.info.sample_rate != self.asr.sample_rate: self.get_logger().warning(f"audio has sample_rate {msg.info.sample_rate}, but ASR expects sample_rate {self.asr.sample_rate}") samples = np.frombuffer(msg.data, dtype=msg.info.sample_format) self.get_logger().debug(f'received audio samples {samples.shape} dtype={samples.dtype}') # rms={np.sqrt(np.mean(samples**2))}') results = self.asr(samples) for transcript in results: text = transcript['text'].strip() if len(text) == 0: continue msg = String() msg.data = text self.get_logger().info(f"transcript: {text}") if transcript['end']: self.transcript_publisher.publish(msg) self.partial_transcript_publisher.publish(msg) def main(args=None): rclpy.init(args=args) node = ASRNode() rclpy.spin(node) node.destroy_node() rclpy.shutdown() if __name__ == "__main__": main() ================================================ FILE: ros/jetson_voice_ros/audio_input.py ================================================ #!/usr/bin/env python3 import os import rclpy import numpy as np from rclpy.node import Node from jetson_voice.utils import AudioInput, audio_to_int16 from jetson_voice_ros.msg import Audio class AudioInputNode(Node): def __init__(self): super().__init__('audio_input', namespace='voice') # create topics self.audio_publisher = self.create_publisher(Audio, 'audio_in', 10) # get node parameters self.declare_parameter('device', '') # input audio device ID or name self.declare_parameter('sample_rate', 16000) # sample rate (in Hz) self.declare_parameter('chunk_size', 16000) # number of samples per buffer self.declare_parameter('resets', -1) # number of times to reset the device (-1 is infinite) self.device_name =
str(self.get_parameter('device').value) self.sample_rate = self.get_parameter('sample_rate').value self.chunk_size = self.get_parameter('chunk_size').value self.resets = self.get_parameter('resets').value self.reset_count = 0 if self.device_name == '': raise ValueError("must set the 'device' parameter to either an input audio device ID/name or the path to a .wav file") self.get_logger().info(f'device={self.device_name}') self.get_logger().info(f'sample_rate={self.sample_rate}') self.get_logger().info(f'chunk_size={self.chunk_size}') self.get_logger().info(f'resets={self.resets}') # check if this is an audio device or a wav file file_ext = os.path.splitext(self.device_name)[1].lower() if file_ext == '.wav' or file_ext == '.wave': wav = self.device_name mic = '' else: wav = '' mic = self.device_name # create audio device self.device = AudioInput(wav=wav, mic=mic, sample_rate=self.sample_rate, chunk_size=self.chunk_size) self.device.open() # create a timer to check for audio samples self.timer = self.create_timer(self.chunk_size / self.sample_rate * 0.75, self.publish_audio) def publish_audio(self): while True: samples = self.device.next() if samples is not None: break self.get_logger().warning('no audio samples were returned from the audio device') if self.resets < 0 or self.reset_count < self.resets: self.reset_count += 1 self.get_logger().warning(f'resetting audio device {self.device_name} (attempt {self.reset_count} of {self.resets})') self.device.reset() else: self.get_logger().error(f'maximum number of audio device resets has been reached ({self.resets})') return if samples.dtype == np.float32: # convert to int16 to make the message smaller samples = audio_to_int16(samples) if samples.dtype != np.int16: # the other voice nodes expect int16/float32 raise ValueError(f'audio samples are expected to have datatype int16, but they were {samples.dtype}') self.get_logger().debug(f'publishing audio samples {samples.shape} dtype={samples.dtype}') # rms={np.sqrt(np.mean(samples**2))}') # publish message msg = Audio() msg.header.stamp = self.get_clock().now().to_msg() msg.header.frame_id = self.device_name msg.info.channels = 1 # AudioInput is set to mono msg.info.sample_rate = self.sample_rate msg.info.sample_format = str(samples.dtype) msg.data = samples.tobytes() self.audio_publisher.publish(msg) def main(args=None): rclpy.init(args=args) node = AudioInputNode() rclpy.spin(node) node.destroy_node() rclpy.shutdown() if __name__ == "__main__": main() ================================================ FILE: ros/jetson_voice_ros/audio_output.py ================================================ #!/usr/bin/env python3 import os import rclpy import numpy as np from rclpy.node import Node from jetson_voice import AudioOutput from jetson_voice_ros.msg import Audio from soundfile import SoundFile class AudioOutputNode(Node): def __init__(self): super().__init__('audio_output', namespace='voice') # create topics self.audio_subscriber = self.create_subscription(Audio, 'audio_out', self.audio_listener, 10) # get node parameters self.declare_parameter('device', '') # output audio device ID or name self.declare_parameter('sample_rate', 16000) # sample rate (in Hz) self.declare_parameter('chunk_size', 4096) # number of samples per buffer self.device_name = str(self.get_parameter('device').value) self.sample_rate = self.get_parameter('sample_rate').value self.chunk_size = self.get_parameter('chunk_size').value if self.device_name == '': raise ValueError("must set the 'device' parameter to either an output audio device ID/name
or the path to a .wav file") self.get_logger().info(f'device={self.device_name}') self.get_logger().info(f'sample_rate={self.sample_rate}') self.get_logger().info(f'chunk_size={self.chunk_size}') # check if this is an audio device or a wav file file_ext = os.path.splitext(self.device_name)[1].lower() if file_ext == '.wav' or file_ext == '.wave': self.wav = SoundFile(self.device_name, mode='w', samplerate=self.sample_rate, channels=1) self.device = None else: self.wav = None self.device = AudioOutput(self.device_name, sample_rate=self.sample_rate, chunk_size=self.chunk_size) def audio_listener(self, msg): #self.get_logger().debug('received new audio message') #self.get_logger().debug(f'{msg.header}') #self.get_logger().debug(f'{msg.info}') if msg.info.sample_rate != self.sample_rate: self.get_logger().warning(f"audio has sample_rate {msg.info.sample_rate}, but audio device is using sample_rate {self.sample_rate}") samples = np.frombuffer(msg.data, dtype=msg.info.sample_format) self.get_logger().debug(f'received audio samples {samples.shape} dtype={samples.dtype}') # rms={np.sqrt(np.mean(samples**2))}') if self.device is not None: self.device.write(samples) else: self.wav.write(samples) def main(args=None): rclpy.init(args=args) node = AudioOutputNode() rclpy.spin(node) node.destroy_node() rclpy.shutdown() if __name__ == "__main__": main() ================================================ FILE: ros/jetson_voice_ros/nlp_intent_slot.py ================================================ #!/usr/bin/env python3 import os import rclpy from rclpy.node import Node from std_msgs.msg import String from jetson_voice import IntentSlot as IntentSlotFactory from jetson_voice_ros.msg import IntentSlot, Slot class NLPIntentSlotNode(Node): def __init__(self): super().__init__('nlp_intent_slot', namespace='voice') # create topics self.query_subscriber = self.create_subscription(String, 'intent_slot_query', self.query_listener, 10) self.result_publisher = self.create_publisher(IntentSlot, 'intent_slot_results', 10) # get node parameters self.declare_parameter('model', 'distilbert_intent') self.model_name = str(self.get_parameter('model').value) self.get_logger().info(f'model = {self.model_name}') # load the IntentSlot model self.model = IntentSlotFactory(self.model_name) self.get_logger().info(f"model '{self.model_name}' ready") def query_listener(self, msg): text = msg.data.strip() if len(text) == 0: return self.get_logger().info(f"running NLP Intent/Slot query: '{text}'") # run the model results = self.model(text) self.get_logger().info(f"intent: '{results['intent']}'") self.get_logger().info(f"score: {results['score']}") for slot in results['slots']: self.get_logger().info(str(slot)) # create message msg = IntentSlot() msg.query.data = text msg.intent.data = results['intent'] msg.score = float(results['score']) slots = [] for slot in results['slots']: slot_msg = Slot() slot_msg.slot.data = slot['slot'] slot_msg.text.data = slot['text'] slot_msg.score = float(slot['score']) slots.append(slot_msg) msg.slots = tuple(slots) # publish message self.result_publisher.publish(msg) def main(args=None): rclpy.init(args=args) node = NLPIntentSlotNode() rclpy.spin(node) node.destroy_node() rclpy.shutdown() if __name__ == "__main__": main() ================================================ FILE: ros/jetson_voice_ros/nlp_question_answer.py ================================================ #!/usr/bin/env python3 import os import rclpy from rclpy.node import Node from std_msgs.msg import String from jetson_voice import
QuestionAnswer as QuestionAnswerFactory from jetson_voice_ros.msg import QuestionAnswerQuery, QuestionAnswerResult class NLPQuestionAnswerNode(Node): def __init__(self): super().__init__('nlp_question_answer', namespace='voice') # create topics self.query_subscriber = self.create_subscription(QuestionAnswerQuery, 'question_answer_query', self.query_listener, 10) self.result_publisher = self.create_publisher(QuestionAnswerResult, 'question_answer_results', 10) # get node parameters self.declare_parameter('model', 'distilbert_qa_384') self.model_name = str(self.get_parameter('model').value) self.get_logger().info(f'model = {self.model_name}') # load the QA model self.model = QuestionAnswerFactory(self.model_name) self.get_logger().info(f"model '{self.model_name}' ready") def query_listener(self, msg): question = msg.question.data.strip() context = msg.context.data.strip() if len(question) == 0 or len(context) == 0: return self.get_logger().info(f"running NLP Question/Answer query:") self.get_logger().info(f"question: '{question}'") self.get_logger().info(f"context:") self.get_logger().info(context) # run the model results = self.model((question,context)) self.get_logger().info(f"answer: '{results['answer']}'") self.get_logger().info(f"score: {results['score']}") # create message msg = QuestionAnswerResult() msg.question.data = question msg.answer.data = results['answer'] msg.score = float(results['score']) # publish message self.result_publisher.publish(msg) def main(args=None): rclpy.init(args=args) node = NLPQuestionAnswerNode() rclpy.spin(node) node.destroy_node() rclpy.shutdown() if __name__ == "__main__": main() ================================================ FILE: ros/jetson_voice_ros/tts.py ================================================ #!/usr/bin/env python3 import os import rclpy import numpy as np from rclpy.node import Node from std_msgs.msg import String from jetson_voice import TTS from jetson_voice.utils import audio_to_int16 from jetson_voice_ros.msg import Audio class TTSNode(Node): def __init__(self): super().__init__('tts', namespace='voice') # create topics self.text_subscriber = self.create_subscription(String, 'tts_text', self.text_listener, 10) self.audio_publisher = self.create_publisher(Audio, 'tts_audio', 10) # get node parameters self.declare_parameter('model', 'fastpitch_hifigan') self.model_name = str(self.get_parameter('model').value) self.get_logger().info(f'model = {self.model_name}') # load the TTS model self.tts = TTS(self.model_name) self.get_logger().info(f"model '{self.model_name}' ready") def text_listener(self, msg): text = msg.data.strip() if len(text) == 0: return self.get_logger().info(f"running TTS on '{text}'") samples = self.tts(text) samples = audio_to_int16(samples) # publish message msg = Audio() msg.header.stamp = self.get_clock().now().to_msg() msg.header.frame_id = self.model_name msg.info.channels = 1 msg.info.sample_rate = self.tts.sample_rate msg.info.sample_format = str(samples.dtype) msg.data = samples.tobytes() self.audio_publisher.publish(msg) def main(args=None): rclpy.init(args=args) node = TTSNode() rclpy.spin(node) node.destroy_node() rclpy.shutdown() if __name__ == "__main__": main() ================================================ FILE: ros/launch/asr.launch.py ================================================ # # Launch file for streaming ASR (automatic speech recognition) from a microphone or wav file.
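# It starts an audio_input node (reading from a microphone or a wav file) that publishes /voice/audio_in, feeding an asr node that publishes /voice/transcripts and /voice/partial_transcripts.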
# import os from launch import LaunchDescription from launch.actions import IncludeLaunchDescription, DeclareLaunchArgument from launch.launch_description_sources import PythonLaunchDescriptionSource from launch.substitutions import ThisLaunchFileDir, LaunchConfiguration from launch_ros.actions import Node def generate_launch_description(): log_level = DeclareLaunchArgument('log_level', default_value='info') asr_model = DeclareLaunchArgument('model', default_value='quartznet') input_device = DeclareLaunchArgument('input_device', default_value='/jetson-voice/data/audio/dusty.wav') audio_input = Node(package='jetson_voice_ros', node_executable='audio_input.py', parameters=[ {"device": LaunchConfiguration('input_device')}, ], arguments=['--ros-args', '--log-level', LaunchConfiguration('log_level')], output='screen', emulate_tty=True) asr_node = Node(package='jetson_voice_ros', node_executable='asr.py', parameters=[ {"model": LaunchConfiguration('model')}, ], arguments=['--ros-args', '--log-level', LaunchConfiguration('log_level')], output='screen', emulate_tty=True) return LaunchDescription([ log_level, asr_model, input_device, audio_input, asr_node, ]) ================================================ FILE: ros/launch/audio_playback.launch.py ================================================ # # Launch file for playback of an audio stream or wav file. # import os from launch import LaunchDescription from launch.actions import IncludeLaunchDescription, DeclareLaunchArgument from launch.launch_description_sources import PythonLaunchDescriptionSource from launch.substitutions import ThisLaunchFileDir, LaunchConfiguration from launch_ros.actions import Node def generate_launch_description(): log_level = DeclareLaunchArgument('log_level', default_value='info') input_device = DeclareLaunchArgument('input_device', default_value='/jetson-voice/data/audio/dusty.wav') output_device = DeclareLaunchArgument('output_device', default_value='/jetson-voice/data/audio/output.wav') audio_input = Node(package='jetson_voice_ros', node_executable='audio_input.py', parameters=[ {"device": LaunchConfiguration('input_device')}, ], arguments=['--ros-args', '--log-level', LaunchConfiguration('log_level')], output='screen', emulate_tty=True) audio_output = Node(package='jetson_voice_ros', node_executable='audio_output.py', parameters=[ {"device": LaunchConfiguration('output_device')}, ], remappings=[ ("/voice/audio_out", "/voice/audio_in"), ], arguments=['--ros-args', '--log-level', LaunchConfiguration('log_level')], output='screen', emulate_tty=True) return LaunchDescription([ log_level, input_device, output_device, audio_input, audio_output, ]) ================================================ FILE: ros/launch/tts.launch.py ================================================ # # Launch file for TTS (text-to-speech), with playback to an audio device or wav file.
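# It starts a tts node that synthesizes text from /voice/tts_text into /voice/tts_audio, which is remapped into an audio_output node configured for the 22050 Hz synthesis rate.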
# import os from launch import LaunchDescription from launch.actions import IncludeLaunchDescription, DeclareLaunchArgument from launch.launch_description_sources import PythonLaunchDescriptionSource from launch.substitutions import ThisLaunchFileDir, LaunchConfiguration from launch_ros.actions import Node def generate_launch_description(): log_level = DeclareLaunchArgument('log_level', default_value='info') tts_model = DeclareLaunchArgument('model', default_value='fastpitch_hifigan') output_device = DeclareLaunchArgument('output_device', default_value='/jetson-voice/data/audio/tts_test.wav') tts_node = Node(package='jetson_voice_ros', node_executable='tts.py', parameters=[ {"model": LaunchConfiguration('model')}, ], arguments=['--ros-args', '--log-level', LaunchConfiguration('log_level')], output='screen', emulate_tty=True) audio_output = Node(package='jetson_voice_ros', node_executable='audio_output.py', parameters=[ {"device": LaunchConfiguration('output_device')}, {"sample_rate": 22050}, ], remappings=[ ("/voice/audio_out", "/voice/tts_audio"), ], arguments=['--ros-args', '--log-level', LaunchConfiguration('log_level')], output='screen', emulate_tty=True) return LaunchDescription([ log_level, tts_model, output_device, tts_node, audio_output, ]) ================================================ FILE: ros/msg/Audio.msg ================================================ std_msgs/Header header AudioInfo info uint8[] data ================================================ FILE: ros/msg/AudioInfo.msg ================================================ # Number of channels uint8 channels # Sampling rate [Hz] uint32 sample_rate # Audio format (e.g. int16, float32) string sample_format # Audio coding format (e.g. wav, mp3) string coding_format ================================================ FILE: ros/msg/IntentSlot.msg ================================================ # the original query text std_msgs/String query # the classified intent label std_msgs/String intent # the intent probability between [0,1] float32 score # list of slots jetson_voice_ros/Slot[] slots ================================================ FILE: ros/msg/QuestionAnswerQuery.msg ================================================ # the question being asked std_msgs/String question # the context paragraph std_msgs/String context ================================================ FILE: ros/msg/QuestionAnswerResult.msg ================================================ # the question that was asked std_msgs/String question # the answer to the question std_msgs/String answer # the confidence of the answer between [0,1] float32 score ================================================ FILE: ros/msg/Slot.msg ================================================ # the slot class label std_msgs/String slot # the relevant text from the original query std_msgs/String text # classification probability between [0,1] float32 score ================================================ FILE: ros/package.xml ================================================ jetson_voice_ros 0.0.0 ROS2 nodes for jetson_voice Dustin Franklin MIT rclpy std_msgs ament_cmake ament_cmake_python rosidl_default_generators rosidl_default_runtime rosidl_interface_packages ament_lint_auto ament_lint_common ament_cmake ================================================ FILE: scripts/list_audio_devices.py ================================================ #!/usr/bin/env python3 # coding: utf-8 from jetson_voice import list_audio_devices list_audio_devices()
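# The device IDs/names printed here can be passed to the '--mic' option of the other scripts and to the 'device' parameter of the ROS audio_input/audio_output nodes, e.g.:
#   python3 scripts/record_mic.py --mic <device> --output data/audio/test.wav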
================================================ FILE: scripts/list_models.py ================================================ #!/usr/bin/env python3 # coding: utf-8 from jetson_voice import list_models list_models() ================================================ FILE: scripts/nemo_export_onnx.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import argparse import pprint import json import nemo import nemo.collections.asr as nemo_asr import nemo.collections.nlp as nemo_nlp import nemo.collections.tts as nemo_tts from omegaconf import OmegaConf, open_dict model_types = { 'asr' : nemo_asr.models.ASRModel, 'asr_classification' : nemo_asr.models.ASRModel, 'qa' : nemo_nlp.models.QAModel, 'intent_slot' : nemo_nlp.models.IntentSlotClassificationModel, 'text_classification' : nemo_nlp.models.TextClassificationModel, 'token_classification' : nemo_nlp.models.TokenClassificationModel } parser = argparse.ArgumentParser() parser.add_argument('--type', choices=model_types.keys(), type=str, required=True) parser.add_argument('--model', type=str, required=True) # 'QuartzNet15x5Base-En' parser.add_argument('--output', default='', type=str, required=True) args = parser.parse_args() print('nemo version:', nemo.__version__) # load model depending on extension/type extension = os.path.splitext(args.model)[1].lower() if extension == '.nemo': model = model_types[args.type].restore_from(args.model) elif extension == '.ckpt': model = model_types[args.type].load_from_checkpoint(args.model) else: #elif: len(extension) == 0: model = model_types[args.type].from_pretrained(model_name=args.model) #else: # raise ValueError(f'model {args.model} has invalid extension {extension}') # add type string so we can more easily track this later with open_dict(model._cfg): model._cfg.type = args.type model._cfg.model_path = os.path.basename(args.output) model._cfg.model_origin = args.model print('') print('###############################################') print('## Model Config') print('###############################################') pprint.pprint(OmegaConf.to_container(model._cfg)) print('') base_path = os.path.splitext(args.output)[0] json_path = base_path + '.json' yaml_path = base_path + '.yaml' #with open(yaml_path, 'w') as yaml_file: # OmegaConf.save(config=model._cfg, f=yaml_file) # print('saved model config to {:s}'.format(yaml_path)) with open(json_path, 'w') as json_file: json.dump(OmegaConf.to_container(model._cfg), json_file, indent=3) print('saved model config to {:s}'.format(json_path)) model.export(args.output, verbose=True) print('\nexported {:s} to {:s}'.format(args.model, args.output)) ================================================ FILE: scripts/nemo_list_models.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import nemo import nemo.collections.asr as nemo_asr import nemo.collections.nlp as nemo_nlp import nemo.collections.tts as nemo_tts print('nemo version:', nemo.__version__) asr_archs = [model for model in dir(nemo_asr.models) if model.endswith("Model")] nlp_archs = [model for model in dir(nemo_nlp.models) if model.endswith("Model")] tts_archs = [model for model in dir(nemo_tts.models) if model.endswith("Model")] print('ASR architectures:', asr_archs) print('NLP architectures:', nlp_archs) print('TTS architectures:', tts_archs) for asr_arch in asr_archs: print('') print('#####################################################') print('## nemo_asr.models.{:s}'.format(asr_arch)) 
print('#####################################################') print(getattr(nemo_asr.models, asr_arch).list_available_models()) for nlp_arch in nlp_archs: print('') print('#####################################################') print('## nemo_nlp.models.{:s}'.format(nlp_arch)) print('#####################################################') print(getattr(nemo_nlp.models, nlp_arch).list_available_models()) print('') print('#####################################################') print('## nemo_nlp.models.pretrained_lm_models') print('#####################################################') for model in nemo_nlp.modules.get_pretrained_lm_models_list(): print(model) for tts_arch in tts_archs: print('') print('#####################################################') print('## nemo_tts.models.{:s}'.format(tts_arch)) print('#####################################################') print(getattr(nemo_tts.models, tts_arch).list_available_models()) ================================================ FILE: scripts/nemo_train_classifier.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import argparse import torch import pytorch_lightning as pl from omegaconf import OmegaConf from nemo.utils.exp_manager import exp_manager from nemo.collections import nlp as nemo_nlp """ Example SST2 'Stanford Sentiment Treebank' dataset from: https://gluebenchmark.com/tasks https://dl.fbaipublicfiles.com/glue/data/SST-2.zip Pre-processing commands: sed 1d train.tsv > train_nemo_format.tsv sed 1d test.tsv > test_nemo_format.tsv sed 1d dev.tsv > dev_nemo_format.tsv """ # parse args parser = argparse.ArgumentParser() parser.add_argument('--dataset', default='datasets/sentiment/SST-2', type=str) parser.add_argument('--config', default='config/text_classification_config.yaml', type=str) parser.add_argument('--model', default='distilbert-base-uncased', type=str) # "bert-base-uncased" parser.add_argument('--classes', default=2, type=int) parser.add_argument('--epochs', default=5, type=int) parser.add_argument('--samples', default=-1, type=int) parser.add_argument('--batch-size', default=32, type=int) parser.add_argument('--learning-rate', '--lr', default=0.00002, type=float) parser.add_argument('--max-seq-length', default=128, type=int) args = parser.parse_args() print(args) # load config config = OmegaConf.load(args.config) print(f'loaded config from {args.config}') # setup config config.model.train_ds.file_path = os.path.join(args.dataset, 'train_nemo_format.tsv') config.model.validation_ds.file_path = os.path.join(args.dataset, 'dev_nemo_format.tsv') config.model.test_ds.file_path = os.path.join(args.dataset, 'test_nemo_format.tsv') config.model.dataset.num_classes = args.classes config.model.dataset.max_seq_length = args.max_seq_length config.model.language_model.pretrained_model_name = args.model config.model.tokenizer.tokenizer_name = args.model config.model.train_ds.batch_size = args.batch_size config.model.validation_ds.batch_size = args.batch_size config.model.test_ds.batch_size = args.batch_size if args.samples > 0: config.model.train_ds.num_samples = args.samples config.model.validation_ds.num_samples = args.samples config.model.test_ds.num_samples = args.samples config.model.optim.lr = args.learning_rate config.trainer.gpus = 1 if torch.cuda.is_available() else 0 config.trainer.precision = 16 if torch.cuda.is_available() else 32 # For mixed precision training, use precision=16 and amp_level=O1 config.trainer.max_epochs = args.epochs config.trainer.accelerator = None # Remove
distributed training flags print(OmegaConf.to_yaml(config)) # create trainer + model trainer = pl.Trainer(**config.trainer) model = nemo_nlp.models.TextClassificationModel(config.model, trainer=trainer) exp_dir = str(exp_manager(trainer, config.get("exp_manager", None))) print('experiment directory:', exp_dir) # start the training trainer.fit(model) # test the model eval_checkpoint_path = trainer.checkpoint_callback.best_model_path eval_model = nemo_nlp.models.TextClassificationModel.load_from_checkpoint(checkpoint_path=eval_checkpoint_path) print('loaded checkpoint for eval:', eval_checkpoint_path) eval_model.setup_test_data(test_data_config=config.model.validation_ds) trainer.test(model=eval_model, ckpt_path=None, verbose=True) # example inference queries = [ 'by the end of no such thing the audience , like beatrice , has a watchful affection for the monster .', 'director rob marshall went out gunning to make a great one .', 'uneasy mishmash of styles and genres .' ] results = eval_model.classifytext( queries=queries, batch_size=len(queries), max_seq_length=config.model.dataset.max_seq_length ) print('The prediction results of some sample queries with the trained model:') for query, result in zip(queries, results): print(f'Query : {query}') print(f'Predicted label: {result}') print('\ndone training:', exp_dir) ================================================ FILE: scripts/nemo_train_intent.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import argparse import torch import pytorch_lightning as pl from omegaconf import OmegaConf from nemo.utils.exp_manager import exp_manager from nemo.collections import nlp as nemo_nlp """ Example dataset from: https://github.com/xliuhw/NLU-Evaluation-Data https://github.com/xliuhw/NLU-Evaluation-Data/archive/master.zip Command used to pre-process the data: python3 intent_import_datasets.py \ --dataset_name=assistant \ --source_data_dir=datasets/intent/NLU-Evaluation-Data-master \ --target_data_dir=datasets/intent/NLU-Evaluation-Data-master/nemo_format """ # parse args parser = argparse.ArgumentParser() parser.add_argument('--dataset', default='data/datasets/NLU-Evaluation-Data-master/nemo_format', type=str) parser.add_argument('--config', default='data/config/training/intent_slot_classification_config.yaml', type=str) parser.add_argument('--exp-dir', default='data/nemo_experiments', type=str) parser.add_argument('--model', default='distilbert-base-uncased', type=str) # "bert-base-uncased" parser.add_argument('--epochs', default=5, type=int) parser.add_argument('--samples', default=-1, type=int) parser.add_argument('--batch-size', default=32, type=int) parser.add_argument('--learning-rate', '--lr', default=0.00002, type=float) parser.add_argument('--max-seq-length', default=50, type=int) args = parser.parse_args() print(args) # load config config = OmegaConf.load(args.config) print(f'loaded config from {args.config}') # setup config config.model.data_dir = args.dataset #os.path.join(args.dataset, 'nemo_format') config.model.language_model.max_seq_length = args.max_seq_length config.model.language_model.pretrained_model_name = args.model config.model.tokenizer.tokenizer_name = args.model config.model.train_ds.batch_size = args.batch_size config.model.validation_ds.batch_size = args.batch_size config.model.test_ds.batch_size = args.batch_size if args.samples > 0: config.model.train_ds.num_samples = args.samples
config.model.validation_ds.num_samples = args.samples config.model.test_ds.num_samples = args.samples config.model.optim.lr = args.learning_rate config.trainer.gpus = 1 if torch.cuda.is_available() else 0 config.trainer.precision = 16 if torch.cuda.is_available() else 32 # For mixed precision training, use precision=16 and amp_level=O1 config.trainer.max_epochs = args.epochs config.trainer.accelerator = None # Remove distributed training flags print(OmegaConf.to_yaml(config)) # create trainer + model trainer = pl.Trainer(**config.trainer) model = nemo_nlp.models.IntentSlotClassificationModel(config.model, trainer=trainer) # set experiment directory exp_cfg = config.get('exp_manager', None) exp_cfg['exp_dir'] = args.exp_dir exp_dir = str(exp_manager(trainer, exp_cfg)) print('experiment directory:', exp_dir) # start the training trainer.fit(model) # test the model eval_checkpoint_path = trainer.checkpoint_callback.best_model_path eval_model = nemo_nlp.models.IntentSlotClassificationModel.load_from_checkpoint(checkpoint_path=eval_checkpoint_path) print('loaded checkpoint for eval:', eval_checkpoint_path) eval_model.setup_test_data(test_data_config=config.model.test_ds) trainer.test(model=eval_model, ckpt_path=None, verbose=True) # example inference queries = [ 'set alarm for seven thirty am', 'lower volume by fifty percent', 'what is my schedule for tomorrow', ] pred_intents, pred_slots = eval_model.predict_from_examples(queries, config.model.test_ds) print('The prediction results of some sample queries with the trained model:') for query, intent, slots in zip(queries, pred_intents, pred_slots): print(f'Query : {query}') print(f'Predicted Intent: {intent}') print(f'Predicted Slots: {slots}') print('\ndone training:', exp_dir) ================================================ FILE: scripts/nemo_train_ner.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import argparse import torch import pytorch_lightning as pl from omegaconf import OmegaConf from nemo.utils.exp_manager import exp_manager from nemo.collections import nlp as nemo_nlp """ Example GMB (Groningen Meaning Bank) dataset from: https://dldata-public.s3.us-east-2.amazonaws.com/gmb_v_2.2.0_clean.zip This version of the dataset is already pre-processed, but other IOB format data can be converted using the ner_import_iob.py tool. 
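The expected layout is paired text/labels files (e.g. the text_dev.txt and labels_dev.txt files used for evaluation below), with one whitespace-tokenized sentence per line of the text file and the matching space-separated IOB tags on the corresponding line of the labels file.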
""" # parse args parser = argparse.ArgumentParser() parser.add_argument('--dataset', default='datasets/ner/gmb_v_2.2.0_clean', type=str) parser.add_argument('--config', default='config/token_classification_config.yaml', type=str) parser.add_argument('--model', default='distilbert-base-uncased', type=str) # "bert-base-uncased" parser.add_argument('--epochs', default=5, type=int) parser.add_argument('--samples', default=-1, type=int) parser.add_argument('--batch-size', default=32, type=int) parser.add_argument('--learning-rate', '--lr', default=0.00005, type=float) parser.add_argument('--max-seq-length', default=128, type=int) args = parser.parse_args() print(args) # load config config = OmegaConf.load(args.config) print(f'loaded config from {args.config}') # setup config config.model.dataset.data_dir = args.dataset config.model.dataset.max_seq_length = args.max_seq_length config.model.language_model.pretrained_model_name = args.model config.model.tokenizer.tokenizer_name = args.model config.model.train_ds.batch_size = args.batch_size config.model.validation_ds.batch_size = args.batch_size config.model.test_ds.batch_size = args.batch_size if args.samples > 0: config.model.train_ds.num_samples = args.samples config.model.validation_ds.num_samples = args.samples config.model.test_ds.num_samples = args.samples config.model.optim.lr = args.learning_rate config.trainer.gpus = 1 if torch.cuda.is_available() else 0 config.trainer.precision = 16 if torch.cuda.is_available() else 32 # For mixed precision training, use precision=16 and amp_level=O1 config.trainer.max_epochs = args.epochs config.trainer.accelerator = None # Remove distributed training flags print(OmegaConf.to_yaml(config)) # create trainer + model trainer = pl.Trainer(**config.trainer) model = nemo_nlp.models.TokenClassificationModel(config.model, trainer=trainer) exp_dir = str(exp_manager(trainer, config.get("exp_manager", None))) print('experiment directory:', exp_dir) # start the training trainer.fit(model) # test the model eval_checkpoint_path = trainer.checkpoint_callback.best_model_path eval_model = nemo_nlp.models.TokenClassificationModel.load_from_checkpoint(checkpoint_path=eval_checkpoint_path) print('loaded checkpoint for eval:', eval_checkpoint_path) eval_model.setup_test_data(test_data_config=config.model.test_ds) trainer.test(model=eval_model, ckpt_path=None, verbose=True) # example inference eval_model.evaluate_from_file( text_file=os.path.join(args.dataset, 'text_dev.txt'), labels_file=os.path.join(args.dataset, 'labels_dev.txt'), output_dir=exp_dir, ) print('\ndone training:', exp_dir) ================================================ FILE: scripts/nemo_train_qa.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import argparse import torch import pytorch_lightning as pl from omegaconf import OmegaConf from nemo.utils.exp_manager import exp_manager from nemo.collections import nlp as nemo_nlp # parse args parser = argparse.ArgumentParser() parser.add_argument('--dataset', default='datasets/squad', type=str) parser.add_argument('--dataset-version', default='v1.1', type=str) parser.add_argument('--config', default='config/question_answering_squad_config.yaml', type=str) parser.add_argument('--model', default='distilbert-base-uncased', type=str) # "bert-base-uncased" parser.add_argument('--epochs', default=1, type=int) parser.add_argument('--samples', default=-1, type=int) # 5000 parser.add_argument('--batch-size', default=12, type=int) parser.add_argument('--learning-rate', 
'--lr', default=0.00003, type=float) parser.add_argument('--max-seq-length', default=384, type=int) parser.add_argument('--output', default='', type=str) # defaults to ./nemo_experiments args = parser.parse_args() print(args) # load config config = OmegaConf.load(args.config) print(f'loaded config from {args.config}') # setup config config.model.train_ds.file = os.path.join(args.dataset, args.dataset_version, f'train-{args.dataset_version}.json') config.model.validation_ds.file = os.path.join(args.dataset, args.dataset_version, f'dev-{args.dataset_version}.json') config.model.test_ds.file = config.model.validation_ds.file config.model.language_model.pretrained_model_name = args.model config.model.tokenizer.tokenizer_name = args.model config.model.dataset.max_seq_length = args.max_seq_length if config.model.dataset.doc_stride >= config.model.dataset.max_seq_length: config.model.dataset.doc_stride = int(config.model.dataset.max_seq_length / 2) config.model.train_ds.batch_size = args.batch_size config.model.validation_ds.batch_size = args.batch_size config.model.test_ds.batch_size = args.batch_size if args.samples > 0: config.model.train_ds.num_samples = args.samples config.model.validation_ds.num_samples = args.samples config.model.test_ds.num_samples = args.samples config.model.optim.lr = args.learning_rate config.trainer.gpus = 1 if torch.cuda.is_available() else 0 config.trainer.precision = 16 if torch.cuda.is_available() else 32 # For mixed precision training, use precision=16 and amp_level=O1 config.trainer.max_epochs = args.epochs config.trainer.accelerator = None # Remove distributed training flags if args.output != '': config.exp_manager.exp_dir = args.output print(OmegaConf.to_yaml(config)) # create trainer + model trainer = pl.Trainer(**config.trainer) model = nemo_nlp.models.QAModel(cfg=config.model, trainer=trainer) exp_dir = str(exp_manager(trainer, config.get("exp_manager", None))) print('experiment directory:', exp_dir) # start the training trainer.fit(model) # test the model model.setup_test_data(test_data_config=config.model.test_ds) trainer.test(model) # example inference all_preds, all_nbests = model.inference(file=config.model.test_ds.file, output_nbest_file=os.path.join(exp_dir, 'output_nbest.json'), output_prediction_file=os.path.join(exp_dir, 'output_prediction.json'), batch_size=args.batch_size, num_samples=10) for _, item in all_preds.items(): print(f"question: {item[0]} answer: {item[1]}") print('\ndone training:', exp_dir) ================================================ FILE: scripts/os_version.sh ================================================ #!/usr/bin/env bash ARCH=$(uname -i) echo "ARCH: $ARCH" if [ $ARCH = "aarch64" ]; then L4T_VERSION_STRING=$(head -n 1 /etc/nv_tegra_release) if [ -z "$L4T_VERSION_STRING" ]; then echo "reading L4T version from \"dpkg-query --show nvidia-l4t-core\"" L4T_VERSION_STRING=$(dpkg-query --showformat='${Version}' --show nvidia-l4t-core) L4T_VERSION_ARRAY=(${L4T_VERSION_STRING//./ }) #echo ${L4T_VERSION_ARRAY[@]} #echo ${#L4T_VERSION_ARRAY[@]} L4T_RELEASE=${L4T_VERSION_ARRAY[0]} L4T_REVISION=${L4T_VERSION_ARRAY[1]} else echo "reading L4T version from /etc/nv_tegra_release" L4T_RELEASE=$(echo $L4T_VERSION_STRING | cut -f 2 -d ' ' | grep -Po '(?<=R)[^;]+') L4T_REVISION=$(echo $L4T_VERSION_STRING | cut -f 2 -d ',' | grep -Po '(?<=REVISION: )[^;]+') fi L4T_REVISION_MAJOR=${L4T_REVISION:0:1} L4T_REVISION_MINOR=${L4T_REVISION:2:1} L4T_VERSION="$L4T_RELEASE.$L4T_REVISION" echo "L4T BSP Version: L4T R$L4T_VERSION" fi
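# NOTE: on x86_64 hosts there is no /etc/nv_tegra_release or nvidia-l4t-core package, so only ARCH is printed and the L4T_* variables are left unset.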
================================================ FILE: scripts/record_mic.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import sys import signal import argparse from jetson_voice import AudioInput, list_audio_devices from soundfile import SoundFile parser = argparse.ArgumentParser() parser.add_argument('--mic', default=None, type=str, required=True, help='device name or number of input microphone') parser.add_argument('--output', default=None, type=str, required=True, help='path to output wav/ogg/flac file') parser.add_argument('--sample-rate', default=16000, type=int, help='sample rate (in Hz)') parser.add_argument('--list-devices', action='store_true', help='list audio input devices') args = parser.parse_args() print(args) # list audio devices if args.list_devices: list_audio_devices() sys.exit() # setup exit signal handler record = True def signal_handler(sig, frame): global record record = False print('Ctrl+C received, exiting...') signal.signal(signal.SIGINT, signal_handler) # create the output wav output_wav = SoundFile(args.output, mode='w', samplerate=args.sample_rate, channels=1) # create the audio device input_mic = AudioInput(mic=args.mic, sample_rate=args.sample_rate, chunk_size=4096) # loop until user exits sample_count = 0 while record: samples = input_mic.next() output_wav.write(samples) sample_count += len(samples) output_wav.close() print(f"saved {sample_count / args.sample_rate:.2f} seconds of audio to '{args.output}'") ================================================ FILE: scripts/start_jupyter.sh ================================================ #!/usr/bin/env bash jupyter lab --ip 0.0.0.0 --port 8888 --allow-root &> /var/log/jupyter.log echo "allow 10 sec for JupyterLab to start @ http://$(hostname -I | cut -d' ' -f1):8888 (password nvidia)" echo "JupyterLab logging location: /var/log/jupyter.log (inside the container)" ================================================ FILE: tests/run_tests.py ================================================ #!/usr/bin/env python3 # coding: utf-8 import os import sys import json import logging import argparse import datetime import subprocess parser = argparse.ArgumentParser() parser.add_argument('--log-dir', default='', type=str, help='directory to save log files under') parser.add_argument('--tests', default='data/tests/tests.json', type=str, help='path to config file of tests') parser.add_argument('--model', default='', type=str, help='if specified, only run tests that use this model') parser.add_argument('--module', default='', type=str, help='if specified, only run tests that use this module') parser.add_argument('--config', default='', type=str, help='if specified, only run tests that use this test config') parser.add_argument('--generate', action='store_true', help='generate the expected outputs') args = parser.parse_args() if args.log_dir == '': args.log_dir = os.path.join('data/tests/logs', datetime.datetime.now().strftime("%Y%m%d_%H%M")) if not os.path.exists(args.log_dir): os.makedirs(args.log_dir) print(args) # wrapper for launching test processes def run_test(module, model, config, args=None, log_dir=None): config = os.path.join('data/tests', config) cmd = f"python3 tests/{module} --model {model} --config {config}" if args: cmd += ' ' + args print("\nrunning test:\n\t$", cmd, "\n") if log_dir: tee = f"tee {os.path.join(log_dir, os.path.splitext(os.path.basename(module))[0])}_{model}.txt" cmd = f"mkfifo pipe; {tee} < pipe & {cmd} > pipe; code=$?; rm pipe; exit $code" #
================================================
FILE: tests/run_tests.py
================================================
#!/usr/bin/env python3
# coding: utf-8

import os
import sys
import json
import logging
import argparse
import datetime
import subprocess

parser = argparse.ArgumentParser()

parser.add_argument('--log-dir', default='', type=str, help='directory to save log files under')
parser.add_argument('--tests', default='data/tests/tests.json', type=str, help='path to config file of tests')
parser.add_argument('--model', default='', type=str, help='if specified, only run tests that use this model')
parser.add_argument('--module', default='', type=str, help='if specified, only run tests that use this module')
parser.add_argument('--config', default='', type=str, help='if specified, only run tests that use this test config')
parser.add_argument('--generate', action='store_true', help='generate the expected outputs')

args = parser.parse_args()

if args.log_dir == '':
    args.log_dir = os.path.join('data/tests/logs', datetime.datetime.now().strftime("%Y%m%d_%H%M"))

if not os.path.exists(args.log_dir):
    os.makedirs(args.log_dir)

print(args)

# wrapper for launching test processes
def run_test(module, model, config, args=None, log_dir=None):
    config = os.path.join('data/tests', config)
    cmd = f"python3 tests/{module} --model {model} --config {config}"

    if args:
        cmd += ' ' + args

    print("\nrunning test:\n\t$", cmd, "\n")

    if log_dir:
        tee = f"tee {os.path.join(log_dir, os.path.splitext(os.path.basename(module))[0])}_{model}.txt"
        cmd = f"mkfifo pipe; {tee} < pipe & {cmd} > pipe; code=$?; rm pipe; exit $code"   # https://stackoverflow.com/a/1221844

    results = subprocess.run(cmd, shell=True)

    if results.returncode == 0:
        status = 'PASSED'
    elif results.returncode == 127:
        status = 'GENERATED'
    else:
        status = 'FAILED'

    print(f"\n{status} TEST {module} ({model}) - return code {results.returncode}\n")
    return status

# load the config containing all the tests
with open(args.tests) as config_file:
    test_config = json.load(config_file)

# filter the tests if requested
def filter_test(test):
    if args.model != '' and args.model != test['model']:
        return False
    if args.module != '' and args.module != test['module']:
        return False
    if args.config != '' and args.config != test['config']:
        return False
    return True

test_config = [test for test in test_config if filter_test(test)]

# run the tests
for test in test_config:
    test_args = test.get('args', '')

    if args.generate:
        test_args += ' --generate'

    status = run_test(test['module'], test['model'], test['config'], test_args, args.log_dir)

    # if the test needed to generate the expected outputs, run it again
    if status == 'GENERATED':
        print('generated expected outputs, running test again...')
        status = run_test(test['module'], test['model'], test['config'], test.get('args'), args.log_dir)

    test['status'] = status

# test summary
passed = 0

print('')
print('----------------------------------------------------')
print(' TEST SUMMARY')
print('----------------------------------------------------')

for test in test_config:
    test_str = f"{test['module']} ({test['model']})"
    print(f"{test_str:<40} {test['status']}")
    if test['status'] == 'PASSED':
        passed += 1

print(f"\npassed {passed} of {len(test_config)} tests")
print(f"saved logs to {args.log_dir}")
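run_tests.py expects the --tests file to be a JSON list of test descriptors, each naming the test module, the model, the per-test config file (resolved relative to data/tests/), and optional extra CLI args. A hypothetical entry, written out with the same json module the script uses to read it (the module/model/config values are illustrative):

import json

# hypothetical contents of data/tests/tests.json (values are illustrative)
tests = [
    {
        "module": "test_asr.py",
        "model": "quartznet",
        "config": "asr.json",         # resolved relative to data/tests/ by run_test()
        "args": "--threshold 1"       # optional extra CLI args for the test module
    }
]

with open('data/tests/tests.json', 'w') as f:
    json.dump(tests, f, indent=3)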
================================================
FILE: tests/test_asr.py
================================================
#!/usr/bin/env python3
# coding: utf-8

import os
import sys
import json
import nltk
import logging

from jetson_voice import ASR, AudioInput, ConfigArgParser

parser = ConfigArgParser()

parser.add_argument('--model', default='quartznet', type=str, help='path to model, service name, or json config file')
parser.add_argument('--config', type=str, required=True, help='path to test config file')
parser.add_argument('--threshold', type=int, default=0, help='threshold for comparing actual vs expected outputs')
parser.add_argument('--generate', action='store_true', help='generate the expected outputs')

args = parser.parse_args()
print(args)

print('')
print('----------------------------------------------------')
print(' RUNNING TEST (ASR)')
print('----------------------------------------------------')
print(f' model:  {args.model}')
print(f' config: {args.config}')
print('')

# load test config
with open(args.config) as config_file:
    test_config = json.load(config_file)

# load the model
asr = ASR(args.model)

# list of (passed, num_outputs) tuples
test_results = []

# run tests
for test in test_config:
    stream = AudioInput(wav=test['wav'], sample_rate=asr.sample_rate, chunk_size=asr.chunk_size)
    outputs = []

    for samples in stream:
        output = asr(samples)

        if asr.classification:
            print(f"class '{output[0]}' ({output[1]:.3f})")
            outputs.append(output[0])
        else:
            for transcript in output:
                print(transcript['text'])
                if transcript['end']:
                    print('')
                    outputs.append(transcript['text'])

    if not asr.classification:
        if not transcript['end']:   # pick up the last transcript
            outputs.append(transcript['text'])

    if 'outputs' not in test:
        test['outputs'] = {}

    if args.model not in test['outputs']:
        args.generate = True

    if args.generate:
        test['outputs'][args.model] = outputs
    else:
        expected_outputs = test['outputs'][args.model]

        if len(outputs) != len(expected_outputs):
            logging.error(f"failed test '{test['wav']}' - got {len(outputs)} outputs (expected {len(expected_outputs)})")
            test_results.append((0, len(expected_outputs)))
            continue

        passed = 0

        for i in range(len(expected_outputs)):
            distance = nltk.edit_distance(expected_outputs[i], outputs[i])

            if distance > args.threshold:
                logging.error(f"failed test '{test['wav']}' - edit distance {distance} exceeded threshold of {args.threshold}")
                logging.error(f"  expected: '{expected_outputs[i]}'")
                logging.error(f"  actual:   '{outputs[i]}'")
            else:
                passed += 1

        test_results.append((passed, len(expected_outputs)))

if args.generate:
    print('')
    logging.info(f"generated expected outputs, saving to '{args.config}'")

    with open(args.config, 'w') as config_file:
        json.dump(test_config, config_file, indent=3)

    sys.exit(127)   # exit code 127 signals run_tests.py that outputs were generated

# test summary
passed_tests = 0
passed_outputs = 0
total_outputs = 0

for passed, num_outputs in test_results:
    if passed == num_outputs:
        passed_tests += 1
    passed_outputs += passed
    total_outputs += num_outputs

print('')
print('----------------------------------------------------')
print(' TEST RESULTS (ASR)')
print('----------------------------------------------------')
print(f' model:  {args.model}')
print(f' config: {args.config}')
print(f' passed: {passed_tests} / {len(test_config)} audio files')
print(f'         {passed_outputs} / {total_outputs} outputs')
print('')

if passed_tests != len(test_config):
    logging.error(f"failed test '{args.config}' with model '{args.model}'")
    sys.exit(1)
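The comparison above is Levenshtein edit distance, so --threshold is the number of single-character edits tolerated between the expected and actual transcripts (the default of 0 demands an exact match). A quick illustration (the strings are made up):

import nltk

expected = 'hello world'
actual = 'hello word'      # one character dropped

distance = nltk.edit_distance(expected, actual)
print(distance)            # -> 1, so this passes with --threshold 1 but fails with the default of 0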
================================================
FILE: tests/test_nlp.py
================================================
#!/usr/bin/env python3
# coding: utf-8

import os
import sys
import json
import nltk
import pprint
import logging

from jetson_voice import NLP, ConfigArgParser

parser = ConfigArgParser()

parser.add_argument('--model', default='distilbert_qa_128', type=str, help='path to model, service name, or json config file')
parser.add_argument('--config', type=str, required=True, help='path to test config file')
parser.add_argument('--threshold', type=int, default=0, help='threshold for comparing actual vs expected outputs')
parser.add_argument('--generate', action='store_true', help='generate the expected outputs')

args = parser.parse_args()
print(args)

print('')
print('----------------------------------------------------')
print(' RUNNING TEST (NLP)')
print('----------------------------------------------------')
print(f' model:  {args.model}')
print(f' config: {args.config}')
print('')

# load test config
with open(args.config) as config_file:
    test_config = json.load(config_file)

# load the model
model = NLP(args.model)
type = model.config.type   # nlp task type ('intent_slot', 'qa', 'text_classification', 'token_classification')

"""
if args.type == 'intent_slot':
    model = IntentSlot(args.model)
elif args.type == 'qa':
    model = QuestionAnswer(args.model)
elif args.type == 'text_classification':
    model = TextClassification(args.model)
elif args.type == 'token_classification':
    model = TokenClassification(args.model)
"""

# list of (passed, num_outputs) tuples
test_results = []

# run tests
for test in test_config:
    outputs = []

    if type == 'intent_slot':
        for query in test['queries']:
            results = model(query)

            print('')
            print('query:', query, '\n')
            pprint.pprint(results)
            print('')

            result_str = results['intent']

            for slot in results['slots']:
                result_str += f" {slot['slot']}={slot['text']}"

            outputs.append(result_str)

    elif type == 'qa':
        for question in test['questions']:
            query = {
                'question': question,
                'context': test['context']
            }

            answer = model(query, top_k=1)

            print('\n')
            print('context:', query['context'])
            print('')
            print('question:', query['question'])
            print('')
            print('answer:', answer['answer'])
            print('score: ', answer['score'])

            outputs.append(answer['answer'])

    elif type == 'text_classification':
        for query in test['queries']:
            results = model(query)

            print('')
            print('query:', query, '\n')
            pprint.pprint(results)
            print('')

            outputs.append(results['label'])

    elif type == 'token_classification':
        for query in test['queries']:
            results = model(query)
            result_str = model.tag_string(query, results)

            print('')
            print('query:', query, '\n')
            print(model.tag_string(query, results, scores=True))
            print('')

            outputs.append(result_str)

    if 'outputs' not in test:
        test['outputs'] = {}

    if args.model not in test['outputs']:
        args.generate = True

    if args.generate:
        test['outputs'][args.model] = outputs
    else:
        expected_outputs = test['outputs'][args.model]

        if len(outputs) != len(expected_outputs):
            logging.error(f"failed test - got {len(outputs)} outputs (expected {len(expected_outputs)})")
            test_results.append((0, len(expected_outputs)))
            continue

        passed = 0

        for i in range(len(expected_outputs)):
            distance = nltk.edit_distance(expected_outputs[i], outputs[i])

            if distance > args.threshold:
                logging.error(f"failed test - edit distance {distance} exceeded threshold of {args.threshold}")
                logging.error(f"  expected: '{expected_outputs[i]}'")
                logging.error(f"  actual:   '{outputs[i]}'")
            else:
                passed += 1

        test_results.append((passed, len(expected_outputs)))

if args.generate:
    print('')
    logging.info(f"generated expected outputs, saving to '{args.config}'")

    with open(args.config, 'w') as config_file:
        json.dump(test_config, config_file, indent=3)

    sys.exit(127)   # exit code 127 signals run_tests.py that outputs were generated

# test summary
passed_tests = 0
passed_outputs = 0
total_outputs = 0

for passed, num_outputs in test_results:
    if passed == num_outputs:
        passed_tests += 1
    passed_outputs += passed
    total_outputs += num_outputs

print('')
print('----------------------------------------------------')
print(' TEST RESULTS (NLP)')
print('----------------------------------------------------')
print(f' model:  {args.model}')
print(f' config: {args.config}')
print(f' type:   {type}')
print(f' passed: {passed_tests} / {len(test_config)} tests')
print(f'         {passed_outputs} / {total_outputs} queries')
print('')

if passed_tests != len(test_config):
    logging.error(f"failed test '{args.config}' with model '{args.model}'")
    sys.exit(1)
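As the 'qa' branch shows, question-answering queries are dicts with 'question' and 'context' keys, and each QA test supplies one context shared by several questions. A hypothetical test entry for that branch (the context and questions are made up):

# hypothetical QA entry from a test config file (content is made up)
qa_test = {
    "context": "The Jetson Nano is a small computer made by NVIDIA. "
               "It was released in March 2019.",
    "questions": [
        "Who makes the Jetson Nano?",
        "When was the Jetson Nano released?"
    ]
}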
================================================
FILE: tests/test_tts.py
================================================
#!/usr/bin/env python3
# coding: utf-8

import os
import sys
import json
import librosa
import logging
import datetime

from jetson_voice import TTS, ConfigArgParser
from soundfile import SoundFile

parser = ConfigArgParser()

parser.add_argument('--model', default='fastpitch_hifigan', type=str, help='path to model, service name, or json config file')
parser.add_argument('--config', type=str, required=True, help='path to test config file')
parser.add_argument('--rms-threshold', type=float, default=0.005, help='threshold for comparing actual vs expected RMS')
parser.add_argument('--length-threshold', type=float, default=0.1, help='threshold for comparing actual vs expected audio length (in seconds)')
parser.add_argument('--generate', action='store_true', help='generate the expected outputs')
parser.add_argument('--output-dir', default='', help='output directory to save generated audio')

args = parser.parse_args()

if args.output_dir == '':
    args.output_dir = os.path.join('data/tests/tts', args.model, datetime.datetime.now().strftime("%Y%m%d_%H%M"))

if not os.path.exists(args.output_dir):
    os.makedirs(args.output_dir)

print(args)

print('')
print('----------------------------------------------------')
print(' RUNNING TEST (TTS)')
print('----------------------------------------------------')
print(f' model:  {args.model}')
print(f' config: {args.config}')
print('')

# load test config
with open(args.config) as config_file:
    test_config = json.load(config_file)

# load the model
tts = TTS(args.model)

# number of tests that passed
passed = 0

# run tests
for idx, test in enumerate(test_config):
    audio = tts(test['text'])

    wav_path = os.path.join(args.output_dir, f"{idx}.wav")
    wav = SoundFile(wav_path, mode='w', samplerate=tts.sample_rate, channels=1)
    wav.write(audio)
    wav.close()

    actual_length = len(audio) / tts.sample_rate
    actual_rms = float(librosa.feature.rms(y=audio, frame_length=len(audio), center=False)[0][0])

    print(f"'{test['text']}'")
    print(f"audio length = {actual_length}s, RMS = {actual_rms}")
    print(f"saved audio to '{wav_path}'\n")

    if 'outputs' not in test:
        test['outputs'] = {}

    if args.model not in test['outputs']:
        args.generate = True

    if args.generate:
        test['outputs'][args.model] = (actual_length, actual_rms)
    else:
        expected_length, expected_rms = test['outputs'][args.model]

        length_diff = abs(expected_length - actual_length)
        rms_diff = abs(expected_rms - actual_rms)

        if length_diff > args.length_threshold:
            logging.error(f"failed test - length difference of {length_diff}s exceeded threshold of {args.length_threshold} (actual={actual_length}s, expected={expected_length}s)")
            logging.error(f"  '{test['text']}'")
            continue

        if rms_diff > args.rms_threshold:
            logging.error(f"failed test - RMS difference of {rms_diff} exceeded threshold of {args.rms_threshold} (actual={actual_rms}, expected={expected_rms})")
            logging.error(f"  '{test['text']}'")
            continue

        passed += 1

if args.generate:
    print('')
    logging.info(f"generated expected outputs, saving to '{args.config}'")

    with open(args.config, 'w') as config_file:
        json.dump(test_config, config_file, indent=3)

    sys.exit(127)   # exit code 127 signals run_tests.py that outputs were generated

# test summary
print('')
print('----------------------------------------------------')
print(' TEST RESULTS (TTS)')
print('----------------------------------------------------')
print(f' model:  {args.model}')
print(f' config: {args.config}')
print(f' passed: {passed} / {len(test_config)}')
print('')

if passed != len(test_config):
    logging.error(f"failed test '{args.config}' with model '{args.model}'")
    sys.exit(1)
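Rather than comparing waveforms sample-for-sample, test_tts.py fingerprints each synthesized clip by its duration and a single-frame RMS, which tolerates small run-to-run variation in the synthesizer. With frame_length equal to the clip length, that single-frame RMS equals the root mean square of the whole clip, as this sketch with a synthetic signal shows (the sine wave stands in for TTS output):

import numpy as np
import librosa

# synthetic 1-second, 22.05 kHz sine wave standing in for TTS output
sample_rate = 22050
audio = 0.1 * np.sin(2 * np.pi * 440 * np.arange(sample_rate) / sample_rate).astype(np.float32)

# single-frame RMS over the whole clip, as computed in test_tts.py
rms = float(librosa.feature.rms(y=audio, frame_length=len(audio), center=False)[0][0])

# same value computed directly
print(rms, np.sqrt(np.mean(audio ** 2)))   # both ~0.0707 (= 0.1 / sqrt(2))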