Repository: lmnt-com/haste Branch: master Commit: ceba32ecd373 Files: 102 Total size: 762.7 KB Directory structure: gitextract_l70fejx2/ ├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── Makefile ├── README.md ├── benchmarks/ │ ├── benchmark_gru.cc │ ├── benchmark_lstm.cc │ ├── cudnn_wrappers.h │ └── report.py ├── build/ │ ├── MANIFEST.in │ ├── common.py │ ├── setup.pytorch.py │ └── setup.tf.py ├── docs/ │ ├── pytorch/ │ │ ├── haste_pytorch/ │ │ │ ├── GRU.md │ │ │ ├── IndRNN.md │ │ │ ├── LSTM.md │ │ │ ├── LayerNormGRU.md │ │ │ └── LayerNormLSTM.md │ │ └── haste_pytorch.md │ └── tf/ │ ├── haste_tf/ │ │ ├── GRU.md │ │ ├── GRUCell.md │ │ ├── IndRNN.md │ │ ├── LSTM.md │ │ ├── LayerNorm.md │ │ ├── LayerNormGRU.md │ │ ├── LayerNormGRUCell.md │ │ ├── LayerNormLSTM.md │ │ ├── LayerNormLSTMCell.md │ │ └── ZoneoutWrapper.md │ └── haste_tf.md ├── examples/ │ ├── device_ptr.h │ ├── gru.cc │ └── lstm.cc ├── frameworks/ │ ├── pytorch/ │ │ ├── __init__.py │ │ ├── base_rnn.py │ │ ├── gru.cc │ │ ├── gru.py │ │ ├── indrnn.cc │ │ ├── indrnn.py │ │ ├── layer_norm_gru.cc │ │ ├── layer_norm_gru.py │ │ ├── layer_norm_indrnn.cc │ │ ├── layer_norm_indrnn.py │ │ ├── layer_norm_lstm.cc │ │ ├── layer_norm_lstm.py │ │ ├── lstm.cc │ │ ├── lstm.py │ │ ├── support.cc │ │ └── support.h │ └── tf/ │ ├── __init__.py │ ├── arena.h │ ├── base_rnn.py │ ├── gru.cc │ ├── gru.py │ ├── gru_cell.py │ ├── indrnn.cc │ ├── indrnn.py │ ├── layer_norm.cc │ ├── layer_norm.py │ ├── layer_norm_gru.cc │ ├── layer_norm_gru.py │ ├── layer_norm_gru_cell.py │ ├── layer_norm_indrnn.cc │ ├── layer_norm_indrnn.py │ ├── layer_norm_lstm.cc │ ├── layer_norm_lstm.py │ ├── layer_norm_lstm_cell.py │ ├── lstm.cc │ ├── lstm.py │ ├── support.cc │ ├── support.h │ ├── weight_config.py │ └── zoneout_wrapper.py ├── lib/ │ ├── blas.h │ ├── device_assert.h │ ├── gru_backward_gpu.cu.cc │ ├── gru_forward_gpu.cu.cc │ ├── haste/ │ │ ├── gru.h │ │ ├── indrnn.h │ │ ├── layer_norm.h │ │ ├── layer_norm_gru.h │ │ ├── layer_norm_indrnn.h │ │ ├── 
layer_norm_lstm.h │ │ └── lstm.h │ ├── haste.h │ ├── indrnn_backward_gpu.cu.cc │ ├── indrnn_forward_gpu.cu.cc │ ├── inline_ops.h │ ├── layer_norm_backward_gpu.cu.cc │ ├── layer_norm_forward_gpu.cu.cc │ ├── layer_norm_gru_backward_gpu.cu.cc │ ├── layer_norm_gru_forward_gpu.cu.cc │ ├── layer_norm_indrnn_backward_gpu.cu.cc │ ├── layer_norm_indrnn_forward_gpu.cu.cc │ ├── layer_norm_lstm_backward_gpu.cu.cc │ ├── layer_norm_lstm_forward_gpu.cu.cc │ ├── lstm_backward_gpu.cu.cc │ └── lstm_forward_gpu.cu.cc └── validation/ ├── pytorch.py ├── pytorch_speed.py ├── tf.py └── tf_pytorch.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ *.a *.o *.so *.whl benchmark_lstm benchmark_gru haste_lstm haste_gru ================================================ FILE: CHANGELOG.md ================================================ # ChangeLog ## 0.4.0 (2020-04-13) ### Added - New layer normalized GRU layer (`LayerNormGRU`). - New IndRNN layer. - CPU support for all PyTorch layers. - Support for building PyTorch API on Windows. - Added `state` argument to PyTorch layers to specify initial state. - Added weight transforms to TensorFlow API (see docs for details). - Added `get_weights` method to extract weights from RNN layers (TensorFlow). - Added `to_native_weights` and `from_native_weights` to PyTorch API for `LSTM` and `GRU` layers. - Validation tests to check for correctness. ### Changed - Performance improvements to GRU layer. - BREAKING CHANGE: PyTorch layers default to CPU instead of GPU. - BREAKING CHANGE: `h` must not be transposed before passing it to `gru::BackwardPass::Iterate`. ### Fixed - Multi-GPU training with TensorFlow caused by invalid sharing of `cublasHandle_t`. ## 0.3.0 (2020-03-09) ### Added - PyTorch support. - New layer normalized LSTM layer (`LayerNormLSTM`). 
- New fused layer normalization layer. ### Fixed - Occasional uninitialized memory use in TensorFlow LSTM implementation. ## 0.2.0 (2020-02-12) ### Added - New time-fused API for LSTM (`lstm::ForwardPass::Run`, `lstm::BackwardPass::Run`). - Benchmarking code to evaluate the performance of an implementation. ### Changed - Performance improvements to existing iterative LSTM API. - BREAKING CHANGE: `h` must not be transposed before passing it to `lstm::BackwardPass::Iterate`. - BREAKING CHANGE: `dv` does not need to be allocated and `v` must be passed instead to `lstm::BackwardPass::Iterate`. ## 0.1.0 (2020-01-29) ### Added - Initial release of Haste. ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. 
"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. 
Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. 
Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright 2020 LMNT, Inc. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: Makefile ================================================ AR ?= ar CXX ?= g++ NVCC ?= nvcc -ccbin $(CXX) PYTHON ?= python ifeq ($(OS),Windows_NT) LIBHASTE := haste.lib CUDA_HOME ?= $(CUDA_PATH) AR := lib AR_FLAGS := /nologo /out:$(LIBHASTE) NVCC_FLAGS := -x cu -Xcompiler "/MD" else LIBHASTE := libhaste.a CUDA_HOME ?= /usr/local/cuda AR ?= ar AR_FLAGS := -crv $(LIBHASTE) NVCC_FLAGS := -std=c++11 -x cu -Xcompiler -fPIC endif LOCAL_CFLAGS := -I/usr/include/eigen3 -I$(CUDA_HOME)/include -Ilib -O3 LOCAL_LDFLAGS := -L$(CUDA_HOME)/lib64 -L. -lcudart -lcublas GPU_ARCH_FLAGS := -gencode arch=compute_37,code=compute_37 -gencode arch=compute_60,code=compute_60 -gencode arch=compute_70,code=compute_70 # Small enough project that we can just recompile all the time. 
.PHONY: all haste haste_tf haste_pytorch libhaste_tf examples benchmarks clean all: haste haste_tf haste_pytorch examples benchmarks haste: $(NVCC) $(GPU_ARCH_FLAGS) -c lib/lstm_forward_gpu.cu.cc -o lib/lstm_forward_gpu.o $(NVCC_FLAGS) $(LOCAL_CFLAGS) $(NVCC) $(GPU_ARCH_FLAGS) -c lib/lstm_backward_gpu.cu.cc -o lib/lstm_backward_gpu.o $(NVCC_FLAGS) $(LOCAL_CFLAGS) $(NVCC) $(GPU_ARCH_FLAGS) -c lib/gru_forward_gpu.cu.cc -o lib/gru_forward_gpu.o $(NVCC_FLAGS) $(LOCAL_CFLAGS) $(NVCC) $(GPU_ARCH_FLAGS) -c lib/gru_backward_gpu.cu.cc -o lib/gru_backward_gpu.o $(NVCC_FLAGS) $(LOCAL_CFLAGS) $(NVCC) $(GPU_ARCH_FLAGS) -c lib/layer_norm_forward_gpu.cu.cc -o lib/layer_norm_forward_gpu.o $(NVCC_FLAGS) $(LOCAL_CFLAGS) $(NVCC) $(GPU_ARCH_FLAGS) -c lib/layer_norm_backward_gpu.cu.cc -o lib/layer_norm_backward_gpu.o $(NVCC_FLAGS) $(LOCAL_CFLAGS) $(NVCC) $(GPU_ARCH_FLAGS) -c lib/layer_norm_lstm_forward_gpu.cu.cc -o lib/layer_norm_lstm_forward_gpu.o $(NVCC_FLAGS) $(LOCAL_CFLAGS) $(NVCC) $(GPU_ARCH_FLAGS) -c lib/layer_norm_lstm_backward_gpu.cu.cc -o lib/layer_norm_lstm_backward_gpu.o $(NVCC_FLAGS) $(LOCAL_CFLAGS) $(NVCC) $(GPU_ARCH_FLAGS) -c lib/layer_norm_gru_forward_gpu.cu.cc -o lib/layer_norm_gru_forward_gpu.o $(NVCC_FLAGS) $(LOCAL_CFLAGS) $(NVCC) $(GPU_ARCH_FLAGS) -c lib/layer_norm_gru_backward_gpu.cu.cc -o lib/layer_norm_gru_backward_gpu.o $(NVCC_FLAGS) $(LOCAL_CFLAGS) $(NVCC) $(GPU_ARCH_FLAGS) -c lib/indrnn_backward_gpu.cu.cc -o lib/indrnn_backward_gpu.o $(NVCC_FLAGS) $(LOCAL_CFLAGS) $(NVCC) $(GPU_ARCH_FLAGS) -c lib/indrnn_forward_gpu.cu.cc -o lib/indrnn_forward_gpu.o $(NVCC_FLAGS) $(LOCAL_CFLAGS) $(NVCC) $(GPU_ARCH_FLAGS) -c lib/layer_norm_indrnn_forward_gpu.cu.cc -o lib/layer_norm_indrnn_forward_gpu.o $(NVCC_FLAGS) $(LOCAL_CFLAGS) $(NVCC) $(GPU_ARCH_FLAGS) -c lib/layer_norm_indrnn_backward_gpu.cu.cc -o lib/layer_norm_indrnn_backward_gpu.o $(NVCC_FLAGS) $(LOCAL_CFLAGS) $(AR) $(AR_FLAGS) lib/*.o libhaste_tf: haste $(eval TF_CFLAGS := $(shell $(PYTHON) -c 'import tensorflow as tf; 
print(" ".join(tf.sysconfig.get_compile_flags()))')) $(eval TF_LDFLAGS := $(shell $(PYTHON) -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))')) $(CXX) -std=c++11 -c frameworks/tf/lstm.cc -o frameworks/tf/lstm.o $(LOCAL_CFLAGS) $(TF_CFLAGS) -fPIC $(CXX) -std=c++11 -c frameworks/tf/gru.cc -o frameworks/tf/gru.o $(LOCAL_CFLAGS) $(TF_CFLAGS) -fPIC $(CXX) -std=c++11 -c frameworks/tf/layer_norm.cc -o frameworks/tf/layer_norm.o $(LOCAL_CFLAGS) $(TF_CFLAGS) -fPIC $(CXX) -std=c++11 -c frameworks/tf/layer_norm_gru.cc -o frameworks/tf/layer_norm_gru.o $(LOCAL_CFLAGS) $(TF_CFLAGS) -fPIC $(CXX) -std=c++11 -c frameworks/tf/layer_norm_indrnn.cc -o frameworks/tf/layer_norm_indrnn.o $(LOCAL_CFLAGS) $(TF_CFLAGS) -fPIC $(CXX) -std=c++11 -c frameworks/tf/layer_norm_lstm.cc -o frameworks/tf/layer_norm_lstm.o $(LOCAL_CFLAGS) $(TF_CFLAGS) -fPIC $(CXX) -std=c++11 -c frameworks/tf/indrnn.cc -o frameworks/tf/indrnn.o $(LOCAL_CFLAGS) $(TF_CFLAGS) -fPIC $(CXX) -std=c++11 -c frameworks/tf/support.cc -o frameworks/tf/support.o $(LOCAL_CFLAGS) $(TF_CFLAGS) -fPIC $(CXX) -shared frameworks/tf/*.o libhaste.a -o frameworks/tf/libhaste_tf.so $(LOCAL_LDFLAGS) $(TF_LDFLAGS) -fPIC # Dependencies handled by setup.py haste_tf: @$(eval TMP := $(shell mktemp -d)) @cp -r . $(TMP) @cat build/common.py build/setup.tf.py > $(TMP)/setup.py @(cd $(TMP); $(PYTHON) setup.py -q bdist_wheel) @cp $(TMP)/dist/*.whl . @rm -rf $(TMP) # Dependencies handled by setup.py haste_pytorch: @$(eval TMP := $(shell mktemp -d)) @cp -r . $(TMP) @cat build/common.py build/setup.pytorch.py > $(TMP)/setup.py @(cd $(TMP); $(PYTHON) setup.py -q bdist_wheel) @cp $(TMP)/dist/*.whl . @rm -rf $(TMP) dist: @$(eval TMP := $(shell mktemp -d)) @cp -r . $(TMP) @cp build/MANIFEST.in $(TMP) @cat build/common.py build/setup.tf.py > $(TMP)/setup.py @(cd $(TMP); $(PYTHON) setup.py -q sdist) @cp $(TMP)/dist/*.tar.gz . @rm -rf $(TMP) @$(eval TMP := $(shell mktemp -d)) @cp -r . 
$(TMP) @cp build/MANIFEST.in $(TMP) @cat build/common.py build/setup.pytorch.py > $(TMP)/setup.py @(cd $(TMP); $(PYTHON) setup.py -q sdist) @cp $(TMP)/dist/*.tar.gz . @rm -rf $(TMP) examples: haste $(CXX) -std=c++11 examples/lstm.cc $(LIBHASTE) $(LOCAL_CFLAGS) $(LOCAL_LDFLAGS) -o haste_lstm -Wno-ignored-attributes $(CXX) -std=c++11 examples/gru.cc $(LIBHASTE) $(LOCAL_CFLAGS) $(LOCAL_LDFLAGS) -o haste_gru -Wno-ignored-attributes benchmarks: haste $(CXX) -std=c++11 benchmarks/benchmark_lstm.cc $(LIBHASTE) $(LOCAL_CFLAGS) $(LOCAL_LDFLAGS) -o benchmark_lstm -Wno-ignored-attributes -lcudnn $(CXX) -std=c++11 benchmarks/benchmark_gru.cc $(LIBHASTE) $(LOCAL_CFLAGS) $(LOCAL_LDFLAGS) -o benchmark_gru -Wno-ignored-attributes -lcudnn clean: rm -fr benchmark_lstm benchmark_gru haste_lstm haste_gru haste_*.whl haste_*.tar.gz find . \( -iname '*.o' -o -iname '*.so' -o -iname '*.a' -o -iname '*.lib' \) -delete ================================================ FILE: README.md ================================================
-------------------------------------------------------------------------------- [![GitHub release (latest SemVer including pre-releases)](https://img.shields.io/github/v/release/lmnt-com/haste?include_prereleases)](https://github.com/lmnt-com/haste/releases) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1hzYhcyvbXYMAUwa3515BszSkhx1UUFSt) [![GitHub](https://img.shields.io/github/license/lmnt-com/haste)](LICENSE) **We're hiring!** If you like what we're building here, [come join us at LMNT](https://explore.lmnt.com). Haste is a CUDA implementation of fused RNN layers with built-in [DropConnect](http://proceedings.mlr.press/v28/wan13.html) and [Zoneout](https://arxiv.org/abs/1606.01305) regularization. These layers are exposed through C++ and Python APIs for easy integration into your own projects or machine learning frameworks. Which RNN types are supported? - [GRU](https://en.wikipedia.org/wiki/Gated_recurrent_unit) - [IndRNN](http://arxiv.org/abs/1803.04831) - [LSTM](https://en.wikipedia.org/wiki/Long_short-term_memory) - [Layer Normalized GRU](https://arxiv.org/abs/1607.06450) - [Layer Normalized LSTM](https://arxiv.org/abs/1607.06450) What's included in this project? - a standalone C++ API (`libhaste`) - a TensorFlow Python API (`haste_tf`) - a PyTorch API (`haste_pytorch`) - examples for writing your own custom C++ inference / training code using `libhaste` - benchmarking programs to evaluate the performance of RNN implementations For questions or feedback about Haste, please open an issue on GitHub or send us an email at [haste@lmnt.com](mailto:haste@lmnt.com). 
## Install Here's what you'll need to get started: - a [CUDA Compute Capability](https://developer.nvidia.com/cuda-gpus) 3.7+ GPU (required) - [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit) 10.0+ (required) - [TensorFlow GPU](https://www.tensorflow.org/install/gpu) 1.14+ or 2.0+ for TensorFlow integration (optional) - [PyTorch](https://pytorch.org) 1.3+ for PyTorch integration (optional) - [Eigen 3](http://eigen.tuxfamily.org/) to build the C++ examples (optional) - [cuDNN Developer Library](https://developer.nvidia.com/rdp/cudnn-archive) to build benchmarking programs (optional) Once you have the prerequisites, you can install with pip or by building the source code. ### Using pip ``` pip install haste_pytorch pip install haste_tf ``` ### Building from source ``` make # Build everything make haste # ;) Build C++ API make haste_tf # Build TensorFlow API make haste_pytorch # Build PyTorch API make examples make benchmarks ``` If you built the TensorFlow or PyTorch API, install it with `pip`: ``` pip install haste_tf-*.whl pip install haste_pytorch-*.whl ``` If the CUDA Toolkit that you're building against is not in `/usr/local/cuda`, you must specify the `$CUDA_HOME` environment variable before running make: ``` CUDA_HOME=/usr/local/cuda-10.2 make ``` ## Performance Our LSTM and GRU benchmarks indicate that Haste has the fastest publicly available implementation for nearly all problem sizes. The following charts show our LSTM results, but the GRU results are qualitatively similar.
Here is our complete LSTM benchmark result grid:
[`N=1 C=64`](https://lmnt.com/assets/haste/benchmark/report_n=1_c=64.png) [`N=1 C=128`](https://lmnt.com/assets/haste/benchmark/report_n=1_c=128.png) [`N=1 C=256`](https://lmnt.com/assets/haste/benchmark/report_n=1_c=256.png) [`N=1 C=512`](https://lmnt.com/assets/haste/benchmark/report_n=1_c=512.png)
[`N=32 C=64`](https://lmnt.com/assets/haste/benchmark/report_n=32_c=64.png) [`N=32 C=128`](https://lmnt.com/assets/haste/benchmark/report_n=32_c=128.png) [`N=32 C=256`](https://lmnt.com/assets/haste/benchmark/report_n=32_c=256.png) [`N=32 C=512`](https://lmnt.com/assets/haste/benchmark/report_n=32_c=512.png)
[`N=64 C=64`](https://lmnt.com/assets/haste/benchmark/report_n=64_c=64.png) [`N=64 C=128`](https://lmnt.com/assets/haste/benchmark/report_n=64_c=128.png) [`N=64 C=256`](https://lmnt.com/assets/haste/benchmark/report_n=64_c=256.png) [`N=64 C=512`](https://lmnt.com/assets/haste/benchmark/report_n=64_c=512.png)
[`N=128 C=64`](https://lmnt.com/assets/haste/benchmark/report_n=128_c=64.png) [`N=128 C=128`](https://lmnt.com/assets/haste/benchmark/report_n=128_c=128.png) [`N=128 C=256`](https://lmnt.com/assets/haste/benchmark/report_n=128_c=256.png) [`N=128 C=512`](https://lmnt.com/assets/haste/benchmark/report_n=128_c=512.png) ## Documentation ### TensorFlow API ```python import haste_tf as haste gru_layer = haste.GRU(num_units=256, direction='bidirectional', zoneout=0.1, dropout=0.05) indrnn_layer = haste.IndRNN(num_units=256, direction='bidirectional', zoneout=0.1) lstm_layer = haste.LSTM(num_units=256, direction='bidirectional', zoneout=0.1, dropout=0.05) norm_gru_layer = haste.LayerNormGRU(num_units=256, direction='bidirectional', zoneout=0.1, dropout=0.05) norm_lstm_layer = haste.LayerNormLSTM(num_units=256, direction='bidirectional', zoneout=0.1, dropout=0.05) # `x` is a tensor with shape [N,T,C] x = tf.random.normal([5, 25, 128]) y, state = gru_layer(x, training=True) y, state = indrnn_layer(x, training=True) y, state = lstm_layer(x, training=True) y, state = norm_gru_layer(x, training=True) y, state = norm_lstm_layer(x, training=True) ``` The TensorFlow Python API is documented in [`docs/tf/haste_tf.md`](docs/tf/haste_tf.md). 
### PyTorch API ```python import torch import haste_pytorch as haste gru_layer = haste.GRU(input_size=128, hidden_size=256, zoneout=0.1, dropout=0.05) indrnn_layer = haste.IndRNN(input_size=128, hidden_size=256, zoneout=0.1) lstm_layer = haste.LSTM(input_size=128, hidden_size=256, zoneout=0.1, dropout=0.05) norm_gru_layer = haste.LayerNormGRU(input_size=128, hidden_size=256, zoneout=0.1, dropout=0.05) norm_lstm_layer = haste.LayerNormLSTM(input_size=128, hidden_size=256, zoneout=0.1, dropout=0.05) gru_layer.cuda() indrnn_layer.cuda() lstm_layer.cuda() norm_gru_layer.cuda() norm_lstm_layer.cuda() # `x` is a CUDA tensor with shape [T,N,C] x = torch.rand([25, 5, 128]).cuda() y, state = gru_layer(x) y, state = indrnn_layer(x) y, state = lstm_layer(x) y, state = norm_gru_layer(x) y, state = norm_lstm_layer(x) ``` The PyTorch API is documented in [`docs/pytorch/haste_pytorch.md`](docs/pytorch/haste_pytorch.md). ### C++ API The C++ API is documented in [`lib/haste/*.h`](lib/haste/) and there are code samples in [`examples/`](examples/). 
## Code layout - [`benchmarks/`](benchmarks): programs to evaluate performance of RNN implementations - [`docs/tf/`](docs/tf): API reference documentation for `haste_tf` - [`docs/pytorch/`](docs/pytorch): API reference documentation for `haste_pytorch` - [`examples/`](examples): examples for writing your own C++ inference / training code using `libhaste` - [`frameworks/tf/`](frameworks/tf): TensorFlow Python API and custom op code - [`frameworks/pytorch/`](frameworks/pytorch): PyTorch API and custom op code - [`lib/`](lib): CUDA kernels and C++ API - [`validation/`](validation): scripts to validate output and gradients of RNN layers ## Implementation notes - the GRU implementation is based on `1406.1078v1` (same as cuDNN) rather than `1406.1078v3` - Zoneout on LSTM cells is applied to the hidden state only, and not the cell state - the layer normalized LSTM implementation uses [these equations](https://github.com/lmnt-com/haste/issues/1) ## References 1. Hochreiter, S., & Schmidhuber, J. (1997). Long Short-Term Memory. _Neural Computation_, _9_(8), 1735–1780. https://doi.org/10.1162/neco.1997.9.8.1735 1. Cho, K., van Merrienboer, B., Gulcehre, C., Bahdanau, D., Bougares, F., Schwenk, H., & Bengio, Y. (2014). Learning Phrase Representations using RNN Encoder-Decoder for Statistical Machine Translation. _arXiv:1406.1078 [cs, stat]_. http://arxiv.org/abs/1406.1078. 1. Wan, L., Zeiler, M., Zhang, S., Cun, Y. L., & Fergus, R. (2013). Regularization of Neural Networks using DropConnect. In _International Conference on Machine Learning_ (pp. 1058–1066). Presented at the International Conference on Machine Learning. http://proceedings.mlr.press/v28/wan13.html. 1. Krueger, D., Maharaj, T., Kramár, J., Pezeshki, M., Ballas, N., Ke, N. R., et al. (2017). Zoneout: Regularizing RNNs by Randomly Preserving Hidden Activations. _arXiv:1606.01305 [cs]_. http://arxiv.org/abs/1606.01305. 1. Ba, J., Kiros, J.R., & Hinton, G.E. (2016). Layer Normalization. 
_arXiv:1607.06450 [cs, stat]_. https://arxiv.org/abs/1607.06450. 1. Li, S., Li, W., Cook, C., Zhu, C., & Gao, Y. (2018). Independently Recurrent Neural Network (IndRNN): Building A Longer and Deeper RNN. _arXiv:1803.04831 [cs]_. http://arxiv.org/abs/1803.04831. ## Citing this work To cite this work, please use the following BibTeX entry: ``` @misc{haste2020, title = {Haste: a fast, simple, and open RNN library}, author = {Sharvil Nanavati}, year = 2020, month = "Jan", howpublished = {\url{https://github.com/lmnt-com/haste/}}, } ``` ## License [Apache 2.0](LICENSE) ================================================ FILE: benchmarks/benchmark_gru.cc ================================================ // Copyright 2020 LMNT, Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
// ============================================================================== #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "../examples/device_ptr.h" #include "cudnn_wrappers.h" #include "haste.h" using haste::v0::gru::BackwardPass; using haste::v0::gru::ForwardPass; using std::string; using Tensor1 = Eigen::Tensor; using Tensor2 = Eigen::Tensor; using Tensor3 = Eigen::Tensor; static constexpr int DEFAULT_SAMPLE_SIZE = 10; static constexpr int DEFAULT_TIME_STEPS = 50; static cudnnHandle_t g_cudnn_handle; static cublasHandle_t g_blas_handle; float TimeLoop(std::function fn, int iterations) { cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start); for (int i = 0; i < iterations; ++i) fn(); float elapsed_ms; cudaEventRecord(stop); cudaEventSynchronize(stop); cudaEventElapsedTime(&elapsed_ms, start, stop); cudaEventDestroy(start); cudaEventDestroy(stop); return elapsed_ms / iterations; } float CudnnInference( int sample_size, const Tensor2& W, const Tensor2& R, const Tensor1& bx, const Tensor1& br, const Tensor3& x) { const int time_steps = x.dimension(2); const int batch_size = x.dimension(1); const int input_size = x.dimension(0); const int hidden_size = R.dimension(1); device_ptr x_dev(x); device_ptr h_dev(batch_size * hidden_size); device_ptr c_dev(batch_size * hidden_size); device_ptr y_dev(time_steps * batch_size * hidden_size); device_ptr h_out_dev(batch_size * hidden_size); device_ptr c_out_dev(batch_size * hidden_size); h_dev.zero(); c_dev.zero(); // Descriptors all the way down. Nice. 
RnnDescriptor rnn_descriptor(g_cudnn_handle, hidden_size, CUDNN_GRU); TensorDescriptorArray x_descriptors(time_steps, { batch_size, input_size, 1 }); TensorDescriptorArray y_descriptors(time_steps, { batch_size, hidden_size, 1 }); auto h_descriptor = TensorDescriptor({ 1, batch_size, hidden_size }); auto c_descriptor = TensorDescriptor({ 1, batch_size, hidden_size }); auto h_out_descriptor = TensorDescriptor({ 1, batch_size, hidden_size }); auto c_out_descriptor = TensorDescriptor({ 1, batch_size, hidden_size }); size_t workspace_size; cudnnGetRNNWorkspaceSize( g_cudnn_handle, *rnn_descriptor, time_steps, &x_descriptors, &workspace_size); auto workspace_dev = device_ptr::NewByteSized(workspace_size); size_t w_count; cudnnGetRNNParamsSize( g_cudnn_handle, *rnn_descriptor, *&x_descriptors, &w_count, CUDNN_DATA_FLOAT); auto w_dev = device_ptr::NewByteSized(w_count); FilterDescriptor w_descriptor(w_dev.Size()); float ms = TimeLoop([&]() { cudnnRNNForwardInference( g_cudnn_handle, *rnn_descriptor, time_steps, &x_descriptors, x_dev.data, *h_descriptor, h_dev.data, *c_descriptor, c_dev.data, *w_descriptor, w_dev.data, &y_descriptors, y_dev.data, *h_out_descriptor, h_out_dev.data, *c_out_descriptor, c_out_dev.data, workspace_dev.data, workspace_size); }, sample_size); return ms; } float CudnnTrain( int sample_size, const Tensor2& W, const Tensor2& R, const Tensor1& bx, const Tensor1& br, const Tensor3& x, const Tensor3& dh) { const int time_steps = x.dimension(2); const int batch_size = x.dimension(1); const int input_size = x.dimension(0); const int hidden_size = R.dimension(1); device_ptr y_dev(time_steps * batch_size * hidden_size); device_ptr dy_dev(time_steps * batch_size * hidden_size); device_ptr dhy_dev(batch_size * hidden_size); device_ptr dcy_dev(batch_size * hidden_size); device_ptr hx_dev(batch_size * hidden_size); device_ptr cx_dev(batch_size * hidden_size); device_ptr dx_dev(time_steps * batch_size * input_size); device_ptr dhx_dev(batch_size * hidden_size); 
device_ptr dcx_dev(batch_size * hidden_size); RnnDescriptor rnn_descriptor(g_cudnn_handle, hidden_size, CUDNN_GRU); TensorDescriptorArray y_descriptors(time_steps, { batch_size, hidden_size, 1 }); TensorDescriptorArray dy_descriptors(time_steps, { batch_size, hidden_size, 1 }); TensorDescriptorArray dx_descriptors(time_steps, { batch_size, input_size, 1 }); TensorDescriptor dhy_descriptor({ 1, batch_size, hidden_size }); TensorDescriptor dcy_descriptor({ 1, batch_size, hidden_size }); TensorDescriptor hx_descriptor({ 1, batch_size, hidden_size }); TensorDescriptor cx_descriptor({ 1, batch_size, hidden_size }); TensorDescriptor dhx_descriptor({ 1, batch_size, hidden_size }); TensorDescriptor dcx_descriptor({ 1, batch_size, hidden_size }); size_t workspace_size = 0; cudnnGetRNNWorkspaceSize( g_cudnn_handle, *rnn_descriptor, time_steps, &dx_descriptors, &workspace_size); auto workspace_dev = device_ptr::NewByteSized(workspace_size); size_t w_count = 0; cudnnGetRNNParamsSize( g_cudnn_handle, *rnn_descriptor, *&dx_descriptors, &w_count, CUDNN_DATA_FLOAT); auto w_dev = device_ptr::NewByteSized(w_count); FilterDescriptor w_descriptor(w_dev.Size()); size_t reserve_size = 0; cudnnGetRNNTrainingReserveSize( g_cudnn_handle, *rnn_descriptor, time_steps, &dx_descriptors, &reserve_size); auto reserve_dev = device_ptr::NewByteSized(reserve_size); float ms = TimeLoop([&]() { cudnnRNNForwardTraining( g_cudnn_handle, *rnn_descriptor, time_steps, &dx_descriptors, dx_dev.data, *hx_descriptor, hx_dev.data, *cx_descriptor, cx_dev.data, *w_descriptor, w_dev.data, &y_descriptors, y_dev.data, *dhy_descriptor, dhy_dev.data, *dcy_descriptor, dcy_dev.data, workspace_dev.data, workspace_size, reserve_dev.data, reserve_size); cudnnRNNBackwardData( g_cudnn_handle, *rnn_descriptor, time_steps, &y_descriptors, y_dev.data, &dy_descriptors, dy_dev.data, *dhy_descriptor, dhy_dev.data, *dcy_descriptor, dcy_dev.data, *w_descriptor, w_dev.data, *hx_descriptor, hx_dev.data, *cx_descriptor, cx_dev.data, 
&dx_descriptors, dx_dev.data, *dhx_descriptor, dhx_dev.data, *dcx_descriptor, dcx_dev.data, workspace_dev.data, workspace_size, reserve_dev.data, reserve_size); cudnnRNNBackwardWeights( g_cudnn_handle, *rnn_descriptor, time_steps, &dx_descriptors, dx_dev.data, *hx_descriptor, hx_dev.data, &y_descriptors, y_dev.data, workspace_dev.data, workspace_size, *w_descriptor, w_dev.data, reserve_dev.data, reserve_size); }, sample_size); return ms; } float HasteInference( int sample_size, const Tensor2& W, const Tensor2& R, const Tensor1& bx, const Tensor1& br, const Tensor3& x) { const int time_steps = x.dimension(2); const int batch_size = x.dimension(1); const int input_size = x.dimension(0); const int hidden_size = R.dimension(1); // Copy weights over to GPU. device_ptr W_dev(W); device_ptr R_dev(R); device_ptr bx_dev(bx); device_ptr br_dev(br); device_ptr x_dev(x); device_ptr h_dev((time_steps + 1) * batch_size * hidden_size); device_ptr tmp_Wx_dev(time_steps * batch_size * hidden_size * 3); device_ptr tmp_Rh_dev(batch_size * hidden_size * 3); h_dev.zero(); // Settle down the GPU and off we go! 
cudaDeviceSynchronize(); float ms = TimeLoop([&]() { ForwardPass forward( false, batch_size, input_size, hidden_size, g_blas_handle); forward.Run( time_steps, W_dev.data, R_dev.data, bx_dev.data, br_dev.data, x_dev.data, h_dev.data, nullptr, tmp_Wx_dev.data, tmp_Rh_dev.data, 0.0f, nullptr); }, sample_size); return ms; } float HasteTrain( int sample_size, const Tensor2& W, const Tensor2& R, const Tensor1& bx, const Tensor1& br, const Tensor3& x, const Tensor3& dh) { const int time_steps = x.dimension(2); const int batch_size = x.dimension(1); const int input_size = x.dimension(0); const int hidden_size = R.dimension(1); device_ptr W_dev(W); device_ptr R_dev(R); device_ptr x_dev(x); device_ptr h_dev((time_steps + 1) * batch_size * hidden_size); device_ptr v_dev(time_steps * batch_size * hidden_size * 4); device_ptr tmp_Wx_dev(time_steps * batch_size * hidden_size * 3); device_ptr tmp_Rh_dev(batch_size * hidden_size * 3); device_ptr W_t_dev(W); device_ptr R_t_dev(R); device_ptr bx_dev(bx); device_ptr br_dev(br); device_ptr x_t_dev(x); // These gradients should actually come "from above" but we're just allocating // a bunch of uninitialized memory and passing it in. 
device_ptr dh_new_dev(dh); device_ptr dx_dev(time_steps * batch_size * input_size); device_ptr dW_dev(input_size * hidden_size * 3); device_ptr dR_dev(hidden_size * hidden_size * 3); device_ptr dbx_dev(hidden_size * 3); device_ptr dbr_dev(hidden_size * 3); device_ptr dh_dev(batch_size * hidden_size); device_ptr dp_dev(time_steps * batch_size * hidden_size * 3); device_ptr dq_dev(time_steps * batch_size * hidden_size * 3); ForwardPass forward( true, batch_size, input_size, hidden_size, g_blas_handle); BackwardPass backward( batch_size, input_size, hidden_size, g_blas_handle); static const float alpha = 1.0f; static const float beta = 0.0f; cudaDeviceSynchronize(); float ms = TimeLoop([&]() { forward.Run( time_steps, W_dev.data, R_dev.data, bx_dev.data, br_dev.data, x_dev.data, h_dev.data, v_dev.data, tmp_Wx_dev.data, tmp_Rh_dev.data, 0.0f, nullptr); // Haste needs `x`, `W`, and `R` to be transposed between the forward // pass and backward pass. Add these transposes in here to get a fair // measurement of the overall time it takes to run an entire training // loop. 
cublasSgeam( g_blas_handle, CUBLAS_OP_T, CUBLAS_OP_N, batch_size * time_steps, input_size, &alpha, x_dev.data, input_size, &beta, x_dev.data, batch_size * time_steps, x_t_dev.data, batch_size * time_steps); cublasSgeam( g_blas_handle, CUBLAS_OP_T, CUBLAS_OP_N, input_size, hidden_size * 3, &alpha, W_dev.data, hidden_size * 3, &beta, W_dev.data, input_size, W_t_dev.data, input_size); cublasSgeam( g_blas_handle, CUBLAS_OP_T, CUBLAS_OP_N, hidden_size, hidden_size * 3, &alpha, R_dev.data, hidden_size * 3, &beta, R_dev.data, hidden_size, R_t_dev.data, hidden_size); backward.Run( time_steps, W_t_dev.data, R_t_dev.data, bx_dev.data, br_dev.data, x_t_dev.data, h_dev.data, v_dev.data, dh_new_dev.data, dx_dev.data, dW_dev.data, dR_dev.data, dbx_dev.data, dbr_dev.data, dh_dev.data, dp_dev.data, dq_dev.data, nullptr); }, sample_size); return ms; } void usage(const char* name) { printf("Usage: %s [OPTION]...\n", name); printf(" -h, --help\n"); printf(" -i, --implementation IMPL (default: haste)\n"); printf(" -m, --mode MODE (default: training)\n"); printf(" -s, --sample_size NUM number of runs to average over (default: %d)\n", DEFAULT_SAMPLE_SIZE); printf(" -t, --time_steps NUM number of time steps in RNN (default: %d)\n", DEFAULT_TIME_STEPS); } int main(int argc, char* const* argv) { srand(time(0)); cudnnCreate(&g_cudnn_handle); cublasCreate(&g_blas_handle); static struct option long_options[] = { { "help", no_argument, 0, 'h' }, { "implementation", required_argument, 0, 'i' }, { "mode", required_argument, 0, 'm' }, { "sample_size", required_argument, 0, 's' }, { "time_steps", required_argument, 0, 't' }, { 0, 0, 0, 0 } }; int c; int opt_index; bool inference_flag = false; bool haste_flag = true; int sample_size = DEFAULT_SAMPLE_SIZE; int time_steps = DEFAULT_TIME_STEPS; while ((c = getopt_long(argc, argv, "hi:m:s:t:", long_options, &opt_index)) != -1) switch (c) { case 'h': usage(argv[0]); return 0; case 'i': if (optarg[0] == 'c' || optarg[0] == 'C') haste_flag = false; break; 
case 'm': if (optarg[0] == 'i' || optarg[0] == 'I') inference_flag = true; break; case 's': sscanf(optarg, "%d", &sample_size); break; case 't': sscanf(optarg, "%d", &time_steps); break; } printf("# Benchmark configuration:\n"); printf("# Mode: %s\n", inference_flag ? "inference" : "training"); printf("# Implementation: %s\n", haste_flag ? "Haste" : "cuDNN"); printf("# Sample size: %d\n", sample_size); printf("# Time steps: %d\n", time_steps); printf("#\n"); printf("# batch_size,hidden_size,input_size,time_ms\n"); for (const int N : { 1, 16, 32, 64, 128 }) { for (const int H : { 128, 256, 512, 768, 1024, 1536, 2048, 3072, 4096 }) { for (const int C : { 64, 128, 256, 512 }) { Tensor2 W(H * 3, C); Tensor2 R(H * 3, H); Tensor1 bx(H * 3); Tensor1 br(H * 3); Tensor3 x(C, N, time_steps); Tensor3 dh(H, N, time_steps + 1); float ms; if (inference_flag) { if (haste_flag) ms = HasteInference(sample_size, W, R, bx, br, x); else ms = CudnnInference(sample_size, W, R, bx, br, x); } else { if (haste_flag) ms = HasteTrain(sample_size, W, R, bx, br, x, dh); else ms = CudnnTrain(sample_size, W, R, bx, br, x, dh); } printf("%d,%d,%d,%f\n", N, H, C, ms); } } } cublasDestroy(g_blas_handle); cudnnDestroy(g_cudnn_handle); return 0; } ================================================ FILE: benchmarks/benchmark_lstm.cc ================================================ // Copyright 2020 LMNT, Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
// ============================================================================== #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "../examples/device_ptr.h" #include "cudnn_wrappers.h" #include "haste.h" using haste::v0::lstm::BackwardPass; using haste::v0::lstm::ForwardPass; using std::string; using Tensor1 = Eigen::Tensor; using Tensor2 = Eigen::Tensor; using Tensor3 = Eigen::Tensor; static constexpr int DEFAULT_SAMPLE_SIZE = 10; static constexpr int DEFAULT_TIME_STEPS = 50; static cudnnHandle_t g_cudnn_handle; static cublasHandle_t g_blas_handle; float TimeLoop(std::function fn, int iterations) { cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start); for (int i = 0; i < iterations; ++i) fn(); float elapsed_ms; cudaEventRecord(stop); cudaEventSynchronize(stop); cudaEventElapsedTime(&elapsed_ms, start, stop); cudaEventDestroy(start); cudaEventDestroy(stop); return elapsed_ms / iterations; } float CudnnInference( int sample_size, const Tensor2& W, const Tensor2& R, const Tensor1& b, const Tensor3& x) { const int time_steps = x.dimension(2); const int batch_size = x.dimension(1); const int input_size = x.dimension(0); const int hidden_size = R.dimension(1); device_ptr x_dev(x); device_ptr h_dev(batch_size * hidden_size); device_ptr c_dev(batch_size * hidden_size); device_ptr y_dev(time_steps * batch_size * hidden_size); device_ptr h_out_dev(batch_size * hidden_size); device_ptr c_out_dev(batch_size * hidden_size); h_dev.zero(); c_dev.zero(); // Descriptors all the way down. Nice. 
RnnDescriptor rnn_descriptor(g_cudnn_handle, hidden_size, CUDNN_LSTM); TensorDescriptorArray x_descriptors(time_steps, { batch_size, input_size, 1 }); TensorDescriptorArray y_descriptors(time_steps, { batch_size, hidden_size, 1 }); auto h_descriptor = TensorDescriptor({ 1, batch_size, hidden_size }); auto c_descriptor = TensorDescriptor({ 1, batch_size, hidden_size }); auto h_out_descriptor = TensorDescriptor({ 1, batch_size, hidden_size }); auto c_out_descriptor = TensorDescriptor({ 1, batch_size, hidden_size }); size_t workspace_size; cudnnGetRNNWorkspaceSize( g_cudnn_handle, *rnn_descriptor, time_steps, &x_descriptors, &workspace_size); auto workspace_dev = device_ptr::NewByteSized(workspace_size); size_t w_count; cudnnGetRNNParamsSize( g_cudnn_handle, *rnn_descriptor, *&x_descriptors, &w_count, CUDNN_DATA_FLOAT); auto w_dev = device_ptr::NewByteSized(w_count); FilterDescriptor w_descriptor(w_dev.Size()); float ms = TimeLoop([&]() { cudnnRNNForwardInference( g_cudnn_handle, *rnn_descriptor, time_steps, &x_descriptors, x_dev.data, *h_descriptor, h_dev.data, *c_descriptor, c_dev.data, *w_descriptor, w_dev.data, &y_descriptors, y_dev.data, *h_out_descriptor, h_out_dev.data, *c_out_descriptor, c_out_dev.data, workspace_dev.data, workspace_size); }, sample_size); return ms; } float CudnnTrain( int sample_size, const Tensor2& W, const Tensor2& R, const Tensor1& b, const Tensor3& x, const Tensor3& dh, const Tensor3& dc) { const int time_steps = x.dimension(2); const int batch_size = x.dimension(1); const int input_size = x.dimension(0); const int hidden_size = R.dimension(1); device_ptr y_dev(time_steps * batch_size * hidden_size); device_ptr dy_dev(time_steps * batch_size * hidden_size); device_ptr dhy_dev(batch_size * hidden_size); device_ptr dcy_dev(batch_size * hidden_size); device_ptr hx_dev(batch_size * hidden_size); device_ptr cx_dev(batch_size * hidden_size); device_ptr dx_dev(time_steps * batch_size * input_size); device_ptr dhx_dev(batch_size * hidden_size); 
device_ptr dcx_dev(batch_size * hidden_size); RnnDescriptor rnn_descriptor(g_cudnn_handle, hidden_size, CUDNN_LSTM); TensorDescriptorArray y_descriptors(time_steps, { batch_size, hidden_size, 1 }); TensorDescriptorArray dy_descriptors(time_steps, { batch_size, hidden_size, 1 }); TensorDescriptorArray dx_descriptors(time_steps, { batch_size, input_size, 1 }); TensorDescriptor dhy_descriptor({ 1, batch_size, hidden_size }); TensorDescriptor dcy_descriptor({ 1, batch_size, hidden_size }); TensorDescriptor hx_descriptor({ 1, batch_size, hidden_size }); TensorDescriptor cx_descriptor({ 1, batch_size, hidden_size }); TensorDescriptor dhx_descriptor({ 1, batch_size, hidden_size }); TensorDescriptor dcx_descriptor({ 1, batch_size, hidden_size }); size_t workspace_size = 0; cudnnGetRNNWorkspaceSize( g_cudnn_handle, *rnn_descriptor, time_steps, &dx_descriptors, &workspace_size); auto workspace_dev = device_ptr::NewByteSized(workspace_size); size_t w_count = 0; cudnnGetRNNParamsSize( g_cudnn_handle, *rnn_descriptor, *&dx_descriptors, &w_count, CUDNN_DATA_FLOAT); auto w_dev = device_ptr::NewByteSized(w_count); FilterDescriptor w_descriptor(w_dev.Size()); size_t reserve_size = 0; cudnnGetRNNTrainingReserveSize( g_cudnn_handle, *rnn_descriptor, time_steps, &dx_descriptors, &reserve_size); auto reserve_dev = device_ptr::NewByteSized(reserve_size); float ms = TimeLoop([&]() { cudnnRNNForwardTraining( g_cudnn_handle, *rnn_descriptor, time_steps, &dx_descriptors, dx_dev.data, *hx_descriptor, hx_dev.data, *cx_descriptor, cx_dev.data, *w_descriptor, w_dev.data, &y_descriptors, y_dev.data, *dhy_descriptor, dhy_dev.data, *dcy_descriptor, dcy_dev.data, workspace_dev.data, workspace_size, reserve_dev.data, reserve_size); cudnnRNNBackwardData( g_cudnn_handle, *rnn_descriptor, time_steps, &y_descriptors, y_dev.data, &dy_descriptors, dy_dev.data, *dhy_descriptor, dhy_dev.data, *dcy_descriptor, dcy_dev.data, *w_descriptor, w_dev.data, *hx_descriptor, hx_dev.data, *cx_descriptor, cx_dev.data, 
&dx_descriptors, dx_dev.data, *dhx_descriptor, dhx_dev.data, *dcx_descriptor, dcx_dev.data, workspace_dev.data, workspace_size, reserve_dev.data, reserve_size); cudnnRNNBackwardWeights( g_cudnn_handle, *rnn_descriptor, time_steps, &dx_descriptors, dx_dev.data, *hx_descriptor, hx_dev.data, &y_descriptors, y_dev.data, workspace_dev.data, workspace_size, *w_descriptor, w_dev.data, reserve_dev.data, reserve_size); }, sample_size); return ms; } float HasteInference( int sample_size, const Tensor2& W, const Tensor2& R, const Tensor1& b, const Tensor3& x) { const int time_steps = x.dimension(2); const int batch_size = x.dimension(1); const int input_size = x.dimension(0); const int hidden_size = R.dimension(1); // Copy weights over to GPU. device_ptr W_dev(W); device_ptr R_dev(R); device_ptr b_dev(b); device_ptr x_dev(x); device_ptr h_dev((time_steps + 1) * batch_size * hidden_size); device_ptr c_dev((time_steps + 1) * batch_size * hidden_size); device_ptr v_dev(time_steps * batch_size * hidden_size * 4); device_ptr tmp_Rh_dev(batch_size * hidden_size * 4); h_dev.zero(); c_dev.zero(); // Settle down the GPU and off we go! 
cudaDeviceSynchronize(); float ms = TimeLoop([&]() { ForwardPass forward( false, batch_size, input_size, hidden_size, g_blas_handle); forward.Run( time_steps, W_dev.data, R_dev.data, b_dev.data, x_dev.data, h_dev.data, c_dev.data, v_dev.data, tmp_Rh_dev.data, 0.0f, nullptr); }, sample_size); return ms; } float HasteTrain( int sample_size, const Tensor2& W, const Tensor2& R, const Tensor1& b, const Tensor3& x, const Tensor3& dh, const Tensor3& dc) { const int time_steps = x.dimension(2); const int batch_size = x.dimension(1); const int input_size = x.dimension(0); const int hidden_size = R.dimension(1); Eigen::array transpose_x({ 1, 2, 0 }); Tensor3 x_t = x.shuffle(transpose_x); Eigen::array transpose({ 1, 0 }); Tensor2 W_t = W.shuffle(transpose); Tensor2 R_t = R.shuffle(transpose); device_ptr W_dev(W); device_ptr R_dev(R); device_ptr x_dev(x); device_ptr h_dev((time_steps + 1) * batch_size * hidden_size); device_ptr c_dev((time_steps + 1) * batch_size * hidden_size); device_ptr v_dev(time_steps * batch_size * hidden_size * 4); device_ptr tmp_Rh_dev(batch_size * hidden_size * 4); device_ptr W_t_dev(W_t); device_ptr R_t_dev(R_t); device_ptr b_dev(b); device_ptr x_t_dev(x_t); // These gradients should actually come "from above" but we're just allocating // a bunch of uninitialized memory and passing it in. 
device_ptr dh_new_dev(dh); device_ptr dc_new_dev(dc); device_ptr dx_dev(time_steps * batch_size * input_size); device_ptr dW_dev(input_size * hidden_size * 4); device_ptr dR_dev(hidden_size * hidden_size * 4); device_ptr db_dev(hidden_size * 4); device_ptr dh_dev((time_steps + 1) * batch_size * hidden_size); device_ptr dc_dev((time_steps + 1) * batch_size * hidden_size); dW_dev.zero(); dR_dev.zero(); db_dev.zero(); dh_dev.zero(); dc_dev.zero(); ForwardPass forward( true, batch_size, input_size, hidden_size, g_blas_handle); BackwardPass backward( batch_size, input_size, hidden_size, g_blas_handle); static const float alpha = 1.0f; static const float beta = 0.0f; cudaDeviceSynchronize(); float ms = TimeLoop([&]() { forward.Run( time_steps, W_dev.data, R_dev.data, b_dev.data, x_dev.data, h_dev.data, c_dev.data, v_dev.data, tmp_Rh_dev.data, 0.0f, nullptr); // Haste needs `x`, `W`, and `R` to be transposed between the forward // pass and backward pass. Add these transposes in here to get a fair // measurement of the overall time it takes to run an entire training // loop. 
cublasSgeam( g_blas_handle, CUBLAS_OP_T, CUBLAS_OP_N, batch_size * time_steps, input_size, &alpha, x_dev.data, input_size, &beta, x_dev.data, batch_size * time_steps, x_t_dev.data, batch_size * time_steps); cublasSgeam( g_blas_handle, CUBLAS_OP_T, CUBLAS_OP_N, input_size, hidden_size * 4, &alpha, W_dev.data, hidden_size * 4, &beta, W_dev.data, input_size, W_t_dev.data, input_size); cublasSgeam( g_blas_handle, CUBLAS_OP_T, CUBLAS_OP_N, hidden_size, hidden_size * 4, &alpha, R_dev.data, hidden_size * 4, &beta, R_dev.data, hidden_size, R_t_dev.data, hidden_size); backward.Run( time_steps, W_t_dev.data, R_t_dev.data, b_dev.data, x_t_dev.data, h_dev.data, c_dev.data, dh_new_dev.data, dc_new_dev.data, dx_dev.data, dW_dev.data, dR_dev.data, db_dev.data, dh_dev.data, dc_dev.data, v_dev.data, nullptr); }, sample_size); return ms; } void usage(const char* name) { printf("Usage: %s [OPTION]...\n", name); printf(" -h, --help\n"); printf(" -i, --implementation IMPL (default: haste)\n"); printf(" -m, --mode MODE (default: training)\n"); printf(" -s, --sample_size NUM number of runs to average over (default: %d)\n", DEFAULT_SAMPLE_SIZE); printf(" -t, --time_steps NUM number of time steps in RNN (default: %d)\n", DEFAULT_TIME_STEPS); } int main(int argc, char* const* argv) { srand(time(0)); cudnnCreate(&g_cudnn_handle); cublasCreate(&g_blas_handle); static struct option long_options[] = { { "help", no_argument, 0, 'h' }, { "implementation", required_argument, 0, 'i' }, { "mode", required_argument, 0, 'm' }, { "sample_size", required_argument, 0, 's' }, { "time_steps", required_argument, 0, 't' }, { 0, 0, 0, 0 } }; int c; int opt_index; bool inference_flag = false; bool haste_flag = true; int sample_size = DEFAULT_SAMPLE_SIZE; int time_steps = DEFAULT_TIME_STEPS; while ((c = getopt_long(argc, argv, "hi:m:s:t:", long_options, &opt_index)) != -1) switch (c) { case 'h': usage(argv[0]); return 0; case 'i': if (optarg[0] == 'c' || optarg[0] == 'C') haste_flag = false; break; case 'm': if 
(optarg[0] == 'i' || optarg[0] == 'I') inference_flag = true; break; case 's': sscanf(optarg, "%d", &sample_size); break; case 't': sscanf(optarg, "%d", &time_steps); break; } printf("# Benchmark configuration:\n"); printf("# Mode: %s\n", inference_flag ? "inference" : "training"); printf("# Implementation: %s\n", haste_flag ? "Haste" : "cuDNN"); printf("# Sample size: %d\n", sample_size); printf("# Time steps: %d\n", time_steps); printf("#\n"); printf("# batch_size,hidden_size,input_size,time_ms\n"); for (const int N : { 1, 16, 32, 64, 128 }) { for (const int H : { 128, 256, 512, 768, 1024, 1536, 2048, 3072, 4096 }) { for (const int C : { 64, 128, 256, 512 }) { Tensor2 W(H * 4, C); Tensor2 R(H * 4, H); Tensor1 b(H * 4); Tensor3 x(C, N, time_steps); Tensor3 dh(H, N, time_steps + 1); Tensor3 dc(H, N, time_steps + 1); float ms; if (inference_flag) { if (haste_flag) ms = HasteInference(sample_size, W, R, b, x); else ms = CudnnInference(sample_size, W, R, b, x); } else { if (haste_flag) ms = HasteTrain(sample_size, W, R, b, x, dh, dc); else ms = CudnnTrain(sample_size, W, R, b, x, dh, dc); } printf("%d,%d,%d,%f\n", N, H, C, ms); } } } cublasDestroy(g_blas_handle); cudnnDestroy(g_cudnn_handle); return 0; } ================================================ FILE: benchmarks/cudnn_wrappers.h ================================================ // Copyright 2020 LMNT, Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
// ============================================================================== #pragma once #include #include #include template struct CudnnDataType {}; template<> struct CudnnDataType { static constexpr auto value = CUDNN_DATA_FLOAT; }; template<> struct CudnnDataType { static constexpr auto value = CUDNN_DATA_DOUBLE; }; template class TensorDescriptor { public: TensorDescriptor(const std::vector& dims) { std::vector strides; int stride = 1; for (int i = dims.size() - 1; i >= 0; --i) { strides.insert(strides.begin(), stride); stride *= dims[i]; } cudnnCreateTensorDescriptor(&descriptor_); cudnnSetTensorNdDescriptor(descriptor_, CudnnDataType::value, dims.size(), &dims[0], &strides[0]); } ~TensorDescriptor() { cudnnDestroyTensorDescriptor(descriptor_); } cudnnTensorDescriptor_t& operator*() { return descriptor_; } private: cudnnTensorDescriptor_t descriptor_; }; template class TensorDescriptorArray { public: TensorDescriptorArray(int count, const std::vector& dims) { std::vector strides; int stride = 1; for (int i = dims.size() - 1; i >= 0; --i) { strides.insert(strides.begin(), stride); stride *= dims[i]; } for (int i = 0; i < count; ++i) { cudnnTensorDescriptor_t descriptor; cudnnCreateTensorDescriptor(&descriptor); cudnnSetTensorNdDescriptor(descriptor, CudnnDataType::value, dims.size(), &dims[0], &strides[0]); descriptors_.push_back(descriptor); } } ~TensorDescriptorArray() { for (auto& desc : descriptors_) cudnnDestroyTensorDescriptor(desc); } cudnnTensorDescriptor_t* operator&() { return &descriptors_[0]; } private: std::vector descriptors_; }; class DropoutDescriptor { public: DropoutDescriptor(const cudnnHandle_t& handle) { cudnnCreateDropoutDescriptor(&descriptor_); cudnnSetDropoutDescriptor(descriptor_, handle, 0.0f, nullptr, 0, 0LL); } ~DropoutDescriptor() { cudnnDestroyDropoutDescriptor(descriptor_); } cudnnDropoutDescriptor_t& operator*() { return descriptor_; } private: cudnnDropoutDescriptor_t descriptor_; }; template class RnnDescriptor { public: 
RnnDescriptor(const cudnnHandle_t& handle, int size, cudnnRNNMode_t algorithm) : dropout_(handle) { cudnnCreateRNNDescriptor(&descriptor_); cudnnSetRNNDescriptor( handle, descriptor_, size, 1, *dropout_, CUDNN_LINEAR_INPUT, CUDNN_UNIDIRECTIONAL, algorithm, CUDNN_RNN_ALGO_STANDARD, CudnnDataType::value); } ~RnnDescriptor() { cudnnDestroyRNNDescriptor(descriptor_); } cudnnRNNDescriptor_t& operator*() { return descriptor_; } private: cudnnRNNDescriptor_t descriptor_; DropoutDescriptor dropout_; }; template class FilterDescriptor { public: FilterDescriptor(const size_t size) { int filter_dim[] = { (int)size, 1, 1 }; cudnnCreateFilterDescriptor(&descriptor_); cudnnSetFilterNdDescriptor(descriptor_, CudnnDataType::value, CUDNN_TENSOR_NCHW, 3, filter_dim); } ~FilterDescriptor() { cudnnDestroyFilterDescriptor(descriptor_); } cudnnFilterDescriptor_t& operator*() { return descriptor_; } private: cudnnFilterDescriptor_t descriptor_; }; ================================================ FILE: benchmarks/report.py ================================================ # Copyright 2020 LMNT, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== import argparse import matplotlib.pyplot as plt import numpy as np import os def extract(x, predicate): return np.array(list(filter(predicate, x))) def main(args): np.set_printoptions(suppress=True) A = np.loadtxt(args.A, delimiter=',') B = np.loadtxt(args.B, delimiter=',') faster = 1.0 - A[:,-1] / B[:,-1] print(f'A is faster than B by:') print(f' mean: {np.mean(faster)*100:7.4}%') print(f' std: {np.std(faster)*100:7.4}%') print(f' median: {np.median(faster)*100:7.4}%') print(f' min: {np.min(faster)*100:7.4}%') print(f' max: {np.max(faster)*100:7.4}%') for batch_size in np.unique(A[:,0]): for input_size in np.unique(A[:,2]): a = extract(A, lambda x: x[0] == batch_size and x[2] == input_size) b = extract(B, lambda x: x[0] == batch_size and x[2] == input_size) fig, ax = plt.subplots(dpi=200) ax.set_xticks(a[:,1]) ax.set_xticklabels(a[:,1].astype(np.int32), rotation=60) ax.tick_params(axis='y', which='both', length=0) ax.spines['top'].set_visible(False) ax.spines['right'].set_visible(False) plt.title(f'batch size={int(batch_size)}, input size={int(input_size)}') plt.plot(a[:,1], a[:,-1], color=args.color[0]) plt.plot(a[:,1], b[:,-1], color=args.color[1]) plt.xlabel('hidden size') plt.ylabel('time (ms)') plt.legend(args.name, frameon=False) plt.tight_layout() if args.save: os.makedirs(args.save[0], exist_ok=True) plt.savefig(f'{args.save[0]}/report_n={int(batch_size)}_c={int(input_size)}.png', dpi=200) else: plt.show() if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--name', nargs=2, default=['A', 'B']) parser.add_argument('--color', nargs=2, default=['#1f77b4', '#2ca02c']) parser.add_argument('--save', nargs=1, default=None) parser.add_argument('A') parser.add_argument('B') main(parser.parse_args()) ================================================ FILE: build/MANIFEST.in ================================================ include Makefile include 
frameworks/tf/*.h include frameworks/tf/*.cc include frameworks/pytorch/*.h include frameworks/pytorch/*.cc include lib/*.cc include lib/*.h include lib/haste/*.h ================================================ FILE: build/common.py ================================================ # Copyright 2020 LMNT, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== VERSION = '0.5.0-rc0' DESCRIPTION = 'Haste: a fast, simple, and open RNN library.' AUTHOR = 'LMNT, Inc.' AUTHOR_EMAIL = 'haste@lmnt.com' URL = 'https://haste.lmnt.com' LICENSE = 'Apache 2.0' CLASSIFIERS = [ 'Development Status :: 4 - Beta', 'Intended Audience :: Developers', 'Intended Audience :: Education', 'Intended Audience :: Science/Research', 'License :: OSI Approved :: Apache Software License', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Topic :: Scientific/Engineering :: Mathematics', 'Topic :: Software Development :: Libraries :: Python Modules', 'Topic :: Software Development :: Libraries', ] ================================================ FILE: build/setup.pytorch.py ================================================ # Copyright 2020 LMNT, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== import os import sys from glob import glob from platform import platform from torch.utils import cpp_extension from setuptools import setup from setuptools.dist import Distribution class BuildHaste(cpp_extension.BuildExtension): def run(self): os.system('make haste') super().run() base_path = os.path.dirname(os.path.realpath(__file__)) if 'Windows' in platform(): CUDA_HOME = os.environ.get('CUDA_HOME', os.environ.get('CUDA_PATH')) extra_args = [] else: CUDA_HOME = os.environ.get('CUDA_HOME', '/usr/local/cuda') extra_args = ['-Wno-sign-compare'] with open(f'frameworks/pytorch/_version.py', 'wt') as f: f.write(f'__version__ = "{VERSION}"') extension = cpp_extension.CUDAExtension( 'haste_pytorch_lib', sources = glob('frameworks/pytorch/*.cc'), extra_compile_args = extra_args, include_dirs = [os.path.join(base_path, 'lib'), os.path.join(CUDA_HOME, 'include')], libraries = ['haste'], library_dirs = ['.']) setup(name = 'haste_pytorch', version = VERSION, description = DESCRIPTION, long_description = open('README.md', 'r',encoding='utf-8').read(), long_description_content_type = 'text/markdown', author = AUTHOR, author_email = AUTHOR_EMAIL, url = URL, license = LICENSE, keywords = 'pytorch machine learning rnn lstm gru custom op', packages = ['haste_pytorch'], package_dir = { 'haste_pytorch': 'frameworks/pytorch' }, install_requires = [], ext_modules = [extension], cmdclass = { 'build_ext': BuildHaste }, classifiers = CLASSIFIERS) ================================================ FILE: 
build/setup.tf.py ================================================ # Copyright 2020 LMNT, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== import os import sys from setuptools import setup from setuptools.dist import Distribution from distutils.command.build import build as _build class BinaryDistribution(Distribution): """This class is needed in order to create OS specific wheels.""" def has_ext_modules(self): return True class BuildHaste(_build): def run(self): os.system('make libhaste_tf') super().run() with open(f'frameworks/tf/_version.py', 'wt') as f: f.write(f'__version__ = "{VERSION}"') setup(name = 'haste_tf', version = VERSION, description = DESCRIPTION, long_description = open('README.md', 'r').read(), long_description_content_type = 'text/markdown', author = AUTHOR, author_email = AUTHOR_EMAIL, url = URL, license = LICENSE, keywords = 'tensorflow machine learning rnn lstm gru custom op', packages = ['haste_tf'], package_dir = { 'haste_tf': 'frameworks/tf' }, package_data = { 'haste_tf': ['*.so'] }, install_requires = [], zip_safe = False, distclass = BinaryDistribution, cmdclass = { 'build': BuildHaste }, classifiers = CLASSIFIERS) ================================================ FILE: docs/pytorch/haste_pytorch/GRU.md ================================================
# haste_pytorch.GRU ## Class `GRU` Gated Recurrent Unit layer. This GRU layer offers a fused, GPU-accelerated PyTorch op for inference and training. There are two commonly-used variants of GRU cells. This one implements 1406.1078v1 which applies the reset gate to the hidden state after matrix multiplication. cuDNN also implements this variant. The other variant, 1406.1078v3, applies the reset gate before matrix multiplication and is currently unsupported. This layer has built-in support for DropConnect and Zoneout, which are both techniques used to regularize RNNs. See [\_\_init\_\_](#__init__) and [forward](#forward) for usage. See [from_native_weights](#from_native_weights) and [to_native_weights](#to_native_weights) for compatibility with PyTorch GRUs.

__init__

``` python __init__( input_size, hidden_size, batch_first=False, dropout=0.0, zoneout=0.0 ) ``` Initialize the parameters of the GRU layer. #### Arguments: * `input_size`: int, the feature dimension of the input. * `hidden_size`: int, the feature dimension of the output. * `batch_first`: (optional) bool, if `True`, then the input and output tensors are provided as `(batch, seq, feature)`. * `dropout`: (optional) float, sets the dropout rate for DropConnect regularization on the recurrent matrix. * `zoneout`: (optional) float, sets the zoneout rate for Zoneout regularization. #### Variables: * `kernel`: the input projection weight matrix. Dimensions (input_size, hidden_size * 3) with `z,r,h` gate layout. Initialized with Xavier uniform initialization. * `recurrent_kernel`: the recurrent projection weight matrix. Dimensions (hidden_size, hidden_size * 3) with `z,r,h` gate layout. Initialized with orthogonal initialization. * `bias`: the input projection bias vector. Dimensions (hidden_size * 3) with `z,r,h` gate layout. Initialized to zeros. * `recurrent_bias`: the recurrent projection bias vector. Dimensions (hidden_size * 3) with `z,r,h` gate layout. Initialized to zeros. ## Methods

__call__

``` python __call__( *input, **kwargs ) ``` Call self as a function.

add_module

``` python add_module( name, module ) ``` Adds a child module to the current module. The module can be accessed as an attribute using the given name. #### Args: name (string): name of the child module. The child module can be accessed from this module using the given name module (Module): child module to be added to the module.

apply

``` python apply(fn) ``` Applies ``fn`` recursively to every submodule (as returned by ``.children()``) as well as self. Typical use includes initializing the parameters of a model (see also :ref:`nn-init-doc`). #### Args: fn (:class:`Module` -> None): function to be applied to each submodule #### Returns: * `Module`: self Example:: ``` >>> def init_weights(m): >>> print(m) >>> if type(m) == nn.Linear: >>> m.weight.data.fill_(1.0) >>> print(m.weight) >>> net = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2)) >>> net.apply(init_weights) Linear(in_features=2, out_features=2, bias=True) Parameter containing: tensor([[ 1., 1.], [ 1., 1.]]) Linear(in_features=2, out_features=2, bias=True) Parameter containing: tensor([[ 1., 1.], [ 1., 1.]]) Sequential( (0): Linear(in_features=2, out_features=2, bias=True) (1): Linear(in_features=2, out_features=2, bias=True) ) Sequential( (0): Linear(in_features=2, out_features=2, bias=True) (1): Linear(in_features=2, out_features=2, bias=True) ) ```

buffers

``` python buffers(recurse=True) ``` Returns an iterator over module buffers. #### Args: recurse (bool): if True, then yields buffers of this module and all submodules. Otherwise, yields only buffers that are direct members of this module. #### Yields: * `torch.Tensor`: module buffer Example:: ``` >>> for buf in model.buffers(): >>> print(type(buf.data), buf.size()) (20L,) (20L, 1L, 5L, 5L) ```

children

``` python children() ``` Returns an iterator over immediate children modules. #### Yields: * `Module`: a child module

cpu

``` python cpu() ``` Moves all model parameters and buffers to the CPU. #### Returns: * `Module`: self

cuda

``` python cuda(device=None) ``` Moves all model parameters and buffers to the GPU. This also makes associated parameters and buffers different objects. So it should be called before constructing optimizer if the module will live on GPU while being optimized. #### Arguments: device (int, optional): if specified, all parameters will be copied to that device #### Returns: * `Module`: self

double

``` python double() ``` Casts all floating point parameters and buffers to ``double`` datatype. #### Returns: * `Module`: self

eval

``` python eval() ``` Sets the module in evaluation mode. This has any effect only on certain modules. See documentations of particular modules for details of their behaviors in training/evaluation mode, if they are affected, e.g. :class:`Dropout`, :class:`BatchNorm`, etc. This is equivalent with :meth:`self.train(False) `. #### Returns: * `Module`: self

extra_repr

``` python extra_repr() ``` Set the extra representation of the module. To print customized extra information, you should reimplement this method in your own modules. Both single-line and multi-line strings are acceptable.

float

``` python float() ``` Casts all floating point parameters and buffers to float datatype. #### Returns: * `Module`: self

forward

``` python forward( input, state=None, lengths=None ) ``` Runs a forward pass of the GRU layer. #### Arguments: * `input`: Tensor, a batch of input sequences to pass through the GRU. Dimensions (seq_len, batch_size, input_size) if `batch_first` is `False`, otherwise (batch_size, seq_len, input_size). * `state`: (optional) Tensor, the initial state for each batch element in `input`. Dimensions (1, batch_size, hidden_size). Defaults to zeros. * `lengths`: (optional) Tensor, list of sequence lengths for each batch element. Dimension (batch_size). This argument may be omitted if all batch elements are unpadded and have the same sequence length. #### Returns: * `output`: Tensor, the output of the GRU layer. Dimensions (seq_len, batch_size, hidden_size) if `batch_first` is `False` (default) or (batch_size, seq_len, hidden_size) if `batch_first` is `True`. Note that if `lengths` was specified, the `output` tensor will not be masked. It's the caller's responsibility to either not use the invalid entries or to mask them out before using them. * `h_n`: the hidden state for the last sequence item. Dimensions (1, batch_size, hidden_size).

from_native_weights

``` python from_native_weights( weight_ih_l0, weight_hh_l0, bias_ih_l0, bias_hh_l0 ) ``` Copies and converts the provided PyTorch GRU weights into this layer. #### Arguments: * `weight_ih_l0`: Parameter, the input-hidden weights of the PyTorch GRU layer. * `weight_hh_l0`: Parameter, the hidden-hidden weights of the PyTorch GRU layer. * `bias_ih_l0`: Parameter, the input-hidden bias of the PyTorch GRU layer. * `bias_hh_l0`: Parameter, the hidden-hidden bias of the PyTorch GRU layer.

half

``` python half() ``` Casts all floating point parameters and buffers to ``half`` datatype. #### Returns: * `Module`: self

load_state_dict

``` python load_state_dict( state_dict, strict=True ) ``` Copies parameters and buffers from :attr:`state_dict` into this module and its descendants. If :attr:`strict` is ``True``, then the keys of :attr:`state_dict` must exactly match the keys returned by this module's :meth:`~torch.nn.Module.state_dict` function. #### Arguments: state_dict (dict): a dict containing parameters and persistent buffers. strict (bool, optional): whether to strictly enforce that the keys in :attr:`state_dict` match the keys returned by this module's :meth:`~torch.nn.Module.state_dict` function. Default: ``True`` #### Returns: ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields: * **missing_keys** is a list of str containing the missing keys * **unexpected_keys** is a list of str containing the unexpected keys

modules

``` python modules() ``` Returns an iterator over all modules in the network. #### Yields: * `Module`: a module in the network #### Note: Duplicate modules are returned only once. In the following example, ``l`` will be returned only once. Example:: ``` >>> l = nn.Linear(2, 2) >>> net = nn.Sequential(l, l) >>> for idx, m in enumerate(net.modules()): print(idx, '->', m) ``` 0 -> Sequential( (0): Linear(in_features=2, out_features=2, bias=True) (1): Linear(in_features=2, out_features=2, bias=True) ) 1 -> Linear(in_features=2, out_features=2, bias=True)

named_buffers

``` python named_buffers( prefix='', recurse=True ) ``` Returns an iterator over module buffers, yielding both the name of the buffer as well as the buffer itself. #### Args: prefix (str): prefix to prepend to all buffer names. recurse (bool): if True, then yields buffers of this module and all submodules. Otherwise, yields only buffers that are direct members of this module. #### Yields: * `(string, torch.Tensor)`: Tuple containing the name and buffer Example:: ``` >>> for name, buf in self.named_buffers(): >>> if name in ['running_var']: >>> print(buf.size()) ```

named_children

``` python named_children() ``` Returns an iterator over immediate children modules, yielding both the name of the module as well as the module itself. #### Yields: * `(string, Module)`: Tuple containing a name and child module Example:: ``` >>> for name, module in model.named_children(): >>> if name in ['conv4', 'conv5']: >>> print(module) ```

named_modules

``` python named_modules( memo=None, prefix='' ) ``` Returns an iterator over all modules in the network, yielding both the name of the module as well as the module itself. #### Yields: * `(string, Module)`: Tuple of name and module #### Note: Duplicate modules are returned only once. In the following example, ``l`` will be returned only once. Example:: ``` >>> l = nn.Linear(2, 2) >>> net = nn.Sequential(l, l) >>> for idx, m in enumerate(net.named_modules()): print(idx, '->', m) ``` 0 -> ('', Sequential( (0): Linear(in_features=2, out_features=2, bias=True) (1): Linear(in_features=2, out_features=2, bias=True) )) 1 -> ('0', Linear(in_features=2, out_features=2, bias=True))

named_parameters

``` python named_parameters( prefix='', recurse=True ) ``` Returns an iterator over module parameters, yielding both the name of the parameter as well as the parameter itself. #### Args: prefix (str): prefix to prepend to all parameter names. recurse (bool): if True, then yields parameters of this module and all submodules. Otherwise, yields only parameters that are direct members of this module. #### Yields: * `(string, Parameter)`: Tuple containing the name and parameter Example:: ``` >>> for name, param in self.named_parameters(): >>> if name in ['bias']: >>> print(param.size()) ```

parameters

``` python parameters(recurse=True) ``` Returns an iterator over module parameters. This is typically passed to an optimizer. #### Args: recurse (bool): if True, then yields parameters of this module and all submodules. Otherwise, yields only parameters that are direct members of this module. #### Yields: * `Parameter`: module parameter Example:: ``` >>> for param in model.parameters(): >>> print(type(param.data), param.size()) (20L,) (20L, 1L, 5L, 5L) ```

register_backward_hook

``` python register_backward_hook(hook) ``` Registers a backward hook on the module. The hook will be called every time the gradients with respect to module inputs are computed. The hook should have the following signature:: hook(module, grad_input, grad_output) -> Tensor or None The :attr:`grad_input` and :attr:`grad_output` may be tuples if the module has multiple inputs or outputs. The hook should not modify its arguments, but it can optionally return a new gradient with respect to input that will be used in place of :attr:`grad_input` in subsequent computations. #### Returns: :class:`torch.utils.hooks.RemovableHandle`: a handle that can be used to remove the added hook by calling ``handle.remove()`` .. warning :: The current implementation will not have the presented behavior for complex :class:`Module` that perform many operations. In some failure cases, :attr:`grad_input` and :attr:`grad_output` will only contain the gradients for a subset of the inputs and outputs. For such :class:`Module`, you should use :func:`torch.Tensor.register_hook` directly on a specific input or output to get the required gradients.

register_buffer

``` python register_buffer( name, tensor ) ``` Adds a persistent buffer to the module. This is typically used to register a buffer that should not to be considered a model parameter. For example, BatchNorm's ``running_mean`` is not a parameter, but is part of the persistent state. Buffers can be accessed as attributes using given names. #### Args: name (string): name of the buffer. The buffer can be accessed from this module using the given name tensor (Tensor): buffer to be registered. Example:: ``` >>> self.register_buffer('running_mean', torch.zeros(num_features)) ```

register_forward_hook

``` python register_forward_hook(hook) ``` Registers a forward hook on the module. The hook will be called every time after :func:`forward` has computed an output. It should have the following signature:: hook(module, input, output) -> None or modified output The hook can modify the output. It can modify the input inplace but it will not have effect on forward since this is called after :func:`forward` is called. #### Returns: :class:`torch.utils.hooks.RemovableHandle`: a handle that can be used to remove the added hook by calling ``handle.remove()``

register_forward_pre_hook

``` python register_forward_pre_hook(hook) ``` Registers a forward pre-hook on the module. The hook will be called every time before :func:`forward` is invoked. It should have the following signature:: hook(module, input) -> None or modified input The hook can modify the input. User can either return a tuple or a single modified value in the hook. We will wrap the value into a tuple if a single value is returned(unless that value is already a tuple). #### Returns: :class:`torch.utils.hooks.RemovableHandle`: a handle that can be used to remove the added hook by calling ``handle.remove()``

register_parameter

``` python register_parameter( name, param ) ``` Adds a parameter to the module. The parameter can be accessed as an attribute using given name. #### Args: name (string): name of the parameter. The parameter can be accessed from this module using the given name param (Parameter): parameter to be added to the module.

requires_grad_

``` python requires_grad_(requires_grad=True) ``` Change if autograd should record operations on parameters in this module. This method sets the parameters' :attr:`requires_grad` attributes in-place. This method is helpful for freezing part of the module for finetuning or training parts of a model individually (e.g., GAN training). #### Args: requires_grad (bool): whether autograd should record operations on parameters in this module. Default: ``True``. #### Returns: * `Module`: self

reset_parameters

``` python reset_parameters() ``` Resets this layer's parameters to their initial values.

share_memory

``` python share_memory() ```

state_dict

``` python state_dict( destination=None, prefix='', keep_vars=False ) ``` Returns a dictionary containing a whole state of the module. Both parameters and persistent buffers (e.g. running averages) are included. Keys are corresponding parameter and buffer names. #### Returns: * `dict`: a dictionary containing a whole state of the module Example:: ``` >>> module.state_dict().keys() ['bias', 'weight'] ```

to

``` python to( *args, **kwargs ) ``` Moves and/or casts the parameters and buffers. This can be called as .. function:: to(device=None, dtype=None, non_blocking=False) .. function:: to(dtype, non_blocking=False) .. function:: to(tensor, non_blocking=False) Its signature is similar to :meth:`torch.Tensor.to`, but only accepts floating point desired :attr:`dtype` s. In addition, this method will only cast the floating point parameters and buffers to :attr:`dtype` (if given). The integral parameters and buffers will be moved :attr:`device`, if that is given, but with dtypes unchanged. When :attr:`non_blocking` is set, it tries to convert/move asynchronously with respect to the host if possible, e.g., moving CPU Tensors with pinned memory to CUDA devices. See below for examples. .. note:: This method modifies the module in-place. #### Args: device (:class:`torch.device`): the desired device of the parameters and buffers in this module dtype (:class:`torch.dtype`): the desired floating point type of the floating point parameters and buffers in this module tensor (torch.Tensor): Tensor whose dtype and device are the desired dtype and device for all parameters and buffers in this module #### Returns: * `Module`: self Example:: ``` >>> linear = nn.Linear(2, 2) >>> linear.weight Parameter containing: tensor([[ 0.1913, -0.3420], [-0.5113, -0.2325]]) >>> linear.to(torch.double) Linear(in_features=2, out_features=2, bias=True) >>> linear.weight Parameter containing: tensor([[ 0.1913, -0.3420], [-0.5113, -0.2325]], dtype=torch.float64) >>> gpu1 = torch.device("cuda:1") >>> linear.to(gpu1, dtype=torch.half, non_blocking=True) Linear(in_features=2, out_features=2, bias=True) >>> linear.weight Parameter containing: tensor([[ 0.1914, -0.3420], [-0.5112, -0.2324]], dtype=torch.float16, device='cuda:1') >>> cpu = torch.device("cpu") >>> linear.to(cpu) Linear(in_features=2, out_features=2, bias=True) >>> linear.weight Parameter containing: tensor([[ 0.1914, -0.3420], [-0.5112, 
-0.2324]], dtype=torch.float16) ```

to_native_weights

``` python to_native_weights() ``` Converts Haste GRU weights to native PyTorch GRU weights. #### Returns: * `weight_ih_l0`: Parameter, the input-hidden weights of the GRU layer. * `weight_hh_l0`: Parameter, the hidden-hidden weights of the GRU layer. * `bias_ih_l0`: Parameter, the input-hidden bias of the GRU layer. * `bias_hh_l0`: Parameter, the hidden-hidden bias of the GRU layer.

train

``` python train(mode=True) ``` Sets the module in training mode. This has any effect only on certain modules. See documentations of particular modules for details of their behaviors in training/evaluation mode, if they are affected, e.g. :class:`Dropout`, :class:`BatchNorm`, etc. #### Args: mode (bool): whether to set training mode (``True``) or evaluation mode (``False``). Default: ``True``. #### Returns: * `Module`: self

type

``` python type(dst_type) ``` Casts all parameters and buffers to :attr:`dst_type`. #### Arguments: dst_type (type or string): the desired type #### Returns: * `Module`: self

zero_grad

``` python zero_grad() ``` Sets gradients of all model parameters to zero. ================================================ FILE: docs/pytorch/haste_pytorch/IndRNN.md ================================================
# haste_pytorch.IndRNN ## Class `IndRNN` Independently Recurrent Neural Network layer. This layer offers a fused, GPU-accelerated PyTorch op for inference and training. It also supports Zoneout regularization. See [\_\_init\_\_](#__init__) and [forward](#forward) for usage.

__init__

``` python __init__( input_size, hidden_size, batch_first=False, zoneout=0.0 ) ``` Initialize the parameters of the IndRNN layer. #### Arguments: * `input_size`: int, the feature dimension of the input. * `hidden_size`: int, the feature dimension of the output. * `batch_first`: (optional) bool, if `True`, then the input and output tensors are provided as `(batch, seq, feature)`. * `zoneout`: (optional) float, sets the zoneout rate for Zoneout regularization. #### Variables: * `kernel`: the input projection weight matrix. Dimensions (input_size, hidden_size). Initialized with Xavier uniform initialization. * `recurrent_scale`: the recurrent scale weight vector. Dimensions (hidden_size). Initialized uniformly in [-0.5, 0.5]. Note that this initialization scheme is different than in the original authors' implementation. See https://github.com/lmnt-com/haste/issues/7 for details. * `bias`: the RNN bias vector. Dimensions (hidden_size). Initialized to zeros. ## Methods

__call__

``` python __call__( *input, **kwargs ) ``` Call self as a function.

add_module

``` python add_module( name, module ) ``` Adds a child module to the current module. The module can be accessed as an attribute using the given name. #### Args: name (string): name of the child module. The child module can be accessed from this module using the given name module (Module): child module to be added to the module.

apply

``` python apply(fn) ``` Applies ``fn`` recursively to every submodule (as returned by ``.children()``) as well as self. Typical use includes initializing the parameters of a model (see also :ref:`nn-init-doc`). #### Args: fn (:class:`Module` -> None): function to be applied to each submodule #### Returns: * `Module`: self Example:: ``` >>> def init_weights(m): >>> print(m) >>> if type(m) == nn.Linear: >>> m.weight.data.fill_(1.0) >>> print(m.weight) >>> net = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2)) >>> net.apply(init_weights) Linear(in_features=2, out_features=2, bias=True) Parameter containing: tensor([[ 1., 1.], [ 1., 1.]]) Linear(in_features=2, out_features=2, bias=True) Parameter containing: tensor([[ 1., 1.], [ 1., 1.]]) Sequential( (0): Linear(in_features=2, out_features=2, bias=True) (1): Linear(in_features=2, out_features=2, bias=True) ) Sequential( (0): Linear(in_features=2, out_features=2, bias=True) (1): Linear(in_features=2, out_features=2, bias=True) ) ```

buffers

``` python buffers(recurse=True) ``` Returns an iterator over module buffers. #### Args: recurse (bool): if True, then yields buffers of this module and all submodules. Otherwise, yields only buffers that are direct members of this module. #### Yields: * `torch.Tensor`: module buffer Example:: ``` >>> for buf in model.buffers(): >>> print(type(buf.data), buf.size()) (20L,) (20L, 1L, 5L, 5L) ```

children

``` python children() ``` Returns an iterator over immediate children modules. #### Yields: * `Module`: a child module

cpu

``` python cpu() ``` Moves all model parameters and buffers to the CPU. #### Returns: * `Module`: self

cuda

``` python cuda(device=None) ``` Moves all model parameters and buffers to the GPU. This also makes associated parameters and buffers different objects. So it should be called before constructing optimizer if the module will live on GPU while being optimized. #### Arguments: device (int, optional): if specified, all parameters will be copied to that device #### Returns: * `Module`: self

double

``` python double() ``` Casts all floating point parameters and buffers to ``double`` datatype. #### Returns: * `Module`: self

eval

``` python eval() ``` Sets the module in evaluation mode. This has any effect only on certain modules. See documentations of particular modules for details of their behaviors in training/evaluation mode, if they are affected, e.g. :class:`Dropout`, :class:`BatchNorm`, etc. This is equivalent to :meth:`self.train(False)`. #### Returns: * `Module`: self

extra_repr

``` python extra_repr() ``` Set the extra representation of the module To print customized extra information, you should reimplement this method in your own modules. Both single-line and multi-line strings are acceptable.

float

``` python float() ``` Casts all floating point parameters and buffers to float datatype. #### Returns: * `Module`: self

forward

``` python forward( input, state=None, lengths=None ) ``` Runs a forward pass of the IndRNN layer. #### Arguments: * `input`: Tensor, a batch of input sequences to pass through the IndRNN. Dimensions (seq_len, batch_size, input_size) if `batch_first` is `False`, otherwise (batch_size, seq_len, input_size). * `state`: (optional) Tensor, the initial state for each batch element in `input`. Dimensions (1, batch_size, hidden_size). Defaults to zeros. * `lengths`: (optional) Tensor, list of sequence lengths for each batch element. Dimension (batch_size). This argument may be omitted if all batch elements are unpadded and have the same sequence length. #### Returns: * `output`: Tensor, the output of the IndRNN layer. Dimensions (seq_len, batch_size, hidden_size) if `batch_first` is `False` (default) or (batch_size, seq_len, hidden_size) if `batch_first` is `True`. Note that if `lengths` was specified, the `output` tensor will not be masked. It's the caller's responsibility to either not use the invalid entries or to mask them out before using them. * `state`: the hidden state for the last sequence item. Dimensions (1, batch_size, hidden_size).

half

``` python half() ``` Casts all floating point parameters and buffers to ``half`` datatype. #### Returns: * `Module`: self

load_state_dict

``` python load_state_dict( state_dict, strict=True ) ``` Copies parameters and buffers from :attr:`state_dict` into this module and its descendants. If :attr:`strict` is ``True``, then the keys of :attr:`state_dict` must exactly match the keys returned by this module's :meth:`~torch.nn.Module.state_dict` function. #### Arguments: state_dict (dict): a dict containing parameters and persistent buffers. strict (bool, optional): whether to strictly enforce that the keys in :attr:`state_dict` match the keys returned by this module's :meth:`~torch.nn.Module.state_dict` function. Default: ``True`` #### Returns: ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields: * **missing_keys** is a list of str containing the missing keys * **unexpected_keys** is a list of str containing the unexpected keys

modules

``` python modules() ``` Returns an iterator over all modules in the network. #### Yields: * `Module`: a module in the network #### Note: Duplicate modules are returned only once. In the following example, ``l`` will be returned only once. Example:: ``` >>> l = nn.Linear(2, 2) >>> net = nn.Sequential(l, l) >>> for idx, m in enumerate(net.modules()): print(idx, '->', m) ``` 0 -> Sequential( (0): Linear(in_features=2, out_features=2, bias=True) (1): Linear(in_features=2, out_features=2, bias=True) ) 1 -> Linear(in_features=2, out_features=2, bias=True)

named_buffers

``` python named_buffers( prefix='', recurse=True ) ``` Returns an iterator over module buffers, yielding both the name of the buffer as well as the buffer itself. #### Args: prefix (str): prefix to prepend to all buffer names. recurse (bool): if True, then yields buffers of this module and all submodules. Otherwise, yields only buffers that are direct members of this module. #### Yields: * `(string, torch.Tensor)`: Tuple containing the name and buffer Example:: ``` >>> for name, buf in self.named_buffers(): >>> if name in ['running_var']: >>> print(buf.size()) ```

named_children

``` python named_children() ``` Returns an iterator over immediate children modules, yielding both the name of the module as well as the module itself. #### Yields: * `(string, Module)`: Tuple containing a name and child module Example:: ``` >>> for name, module in model.named_children(): >>> if name in ['conv4', 'conv5']: >>> print(module) ```

named_modules

``` python named_modules( memo=None, prefix='' ) ``` Returns an iterator over all modules in the network, yielding both the name of the module as well as the module itself. #### Yields: * `(string, Module)`: Tuple of name and module #### Note: Duplicate modules are returned only once. In the following example, ``l`` will be returned only once. Example:: ``` >>> l = nn.Linear(2, 2) >>> net = nn.Sequential(l, l) >>> for idx, m in enumerate(net.named_modules()): print(idx, '->', m) ``` 0 -> ('', Sequential( (0): Linear(in_features=2, out_features=2, bias=True) (1): Linear(in_features=2, out_features=2, bias=True) )) 1 -> ('0', Linear(in_features=2, out_features=2, bias=True))

named_parameters

``` python named_parameters( prefix='', recurse=True ) ``` Returns an iterator over module parameters, yielding both the name of the parameter as well as the parameter itself. #### Args: prefix (str): prefix to prepend to all parameter names. recurse (bool): if True, then yields parameters of this module and all submodules. Otherwise, yields only parameters that are direct members of this module. #### Yields: * `(string, Parameter)`: Tuple containing the name and parameter Example:: ``` >>> for name, param in self.named_parameters(): >>> if name in ['bias']: >>> print(param.size()) ```

parameters

``` python parameters(recurse=True) ``` Returns an iterator over module parameters. This is typically passed to an optimizer. #### Args: recurse (bool): if True, then yields parameters of this module and all submodules. Otherwise, yields only parameters that are direct members of this module. #### Yields: * `Parameter`: module parameter Example:: ``` >>> for param in model.parameters(): >>> print(type(param.data), param.size()) (20L,) (20L, 1L, 5L, 5L) ```

register_backward_hook

``` python register_backward_hook(hook) ``` Registers a backward hook on the module. The hook will be called every time the gradients with respect to module inputs are computed. The hook should have the following signature:: hook(module, grad_input, grad_output) -> Tensor or None The :attr:`grad_input` and :attr:`grad_output` may be tuples if the module has multiple inputs or outputs. The hook should not modify its arguments, but it can optionally return a new gradient with respect to input that will be used in place of :attr:`grad_input` in subsequent computations. #### Returns: :class:`torch.utils.hooks.RemovableHandle`: a handle that can be used to remove the added hook by calling ``handle.remove()`` .. warning :: The current implementation will not have the presented behavior for complex :class:`Module` that perform many operations. In some failure cases, :attr:`grad_input` and :attr:`grad_output` will only contain the gradients for a subset of the inputs and outputs. For such :class:`Module`, you should use :func:`torch.Tensor.register_hook` directly on a specific input or output to get the required gradients.

register_buffer

``` python register_buffer( name, tensor ) ``` Adds a persistent buffer to the module. This is typically used to register a buffer that should not to be considered a model parameter. For example, BatchNorm's ``running_mean`` is not a parameter, but is part of the persistent state. Buffers can be accessed as attributes using given names. #### Args: name (string): name of the buffer. The buffer can be accessed from this module using the given name tensor (Tensor): buffer to be registered. Example:: ``` >>> self.register_buffer('running_mean', torch.zeros(num_features)) ```

register_forward_hook

``` python register_forward_hook(hook) ``` Registers a forward hook on the module. The hook will be called every time after :func:`forward` has computed an output. It should have the following signature:: hook(module, input, output) -> None or modified output The hook can modify the output. It can modify the input inplace but it will not have effect on forward since this is called after :func:`forward` is called. #### Returns: :class:`torch.utils.hooks.RemovableHandle`: a handle that can be used to remove the added hook by calling ``handle.remove()``

register_forward_pre_hook

``` python register_forward_pre_hook(hook) ``` Registers a forward pre-hook on the module. The hook will be called every time before :func:`forward` is invoked. It should have the following signature:: hook(module, input) -> None or modified input The hook can modify the input. User can either return a tuple or a single modified value in the hook. We will wrap the value into a tuple if a single value is returned(unless that value is already a tuple). #### Returns: :class:`torch.utils.hooks.RemovableHandle`: a handle that can be used to remove the added hook by calling ``handle.remove()``

register_parameter

``` python register_parameter( name, param ) ``` Adds a parameter to the module. The parameter can be accessed as an attribute using given name. #### Args: name (string): name of the parameter. The parameter can be accessed from this module using the given name param (Parameter): parameter to be added to the module.

requires_grad_

``` python requires_grad_(requires_grad=True) ``` Change if autograd should record operations on parameters in this module. This method sets the parameters' :attr:`requires_grad` attributes in-place. This method is helpful for freezing part of the module for finetuning or training parts of a model individually (e.g., GAN training). #### Args: requires_grad (bool): whether autograd should record operations on parameters in this module. Default: ``True``. #### Returns: * `Module`: self

reset_parameters

``` python reset_parameters() ```

share_memory

``` python share_memory() ```

state_dict

``` python state_dict( destination=None, prefix='', keep_vars=False ) ``` Returns a dictionary containing a whole state of the module. Both parameters and persistent buffers (e.g. running averages) are included. Keys are corresponding parameter and buffer names. #### Returns: * `dict`: a dictionary containing a whole state of the module Example:: ``` >>> module.state_dict().keys() ['bias', 'weight'] ```

to

``` python to( *args, **kwargs ) ``` Moves and/or casts the parameters and buffers. This can be called as .. function:: to(device=None, dtype=None, non_blocking=False) .. function:: to(dtype, non_blocking=False) .. function:: to(tensor, non_blocking=False) Its signature is similar to :meth:`torch.Tensor.to`, but only accepts floating point desired :attr:`dtype` s. In addition, this method will only cast the floating point parameters and buffers to :attr:`dtype` (if given). The integral parameters and buffers will be moved :attr:`device`, if that is given, but with dtypes unchanged. When :attr:`non_blocking` is set, it tries to convert/move asynchronously with respect to the host if possible, e.g., moving CPU Tensors with pinned memory to CUDA devices. See below for examples. .. note:: This method modifies the module in-place. #### Args: device (:class:`torch.device`): the desired device of the parameters and buffers in this module dtype (:class:`torch.dtype`): the desired floating point type of the floating point parameters and buffers in this module tensor (torch.Tensor): Tensor whose dtype and device are the desired dtype and device for all parameters and buffers in this module #### Returns: * `Module`: self Example:: ``` >>> linear = nn.Linear(2, 2) >>> linear.weight Parameter containing: tensor([[ 0.1913, -0.3420], [-0.5113, -0.2325]]) >>> linear.to(torch.double) Linear(in_features=2, out_features=2, bias=True) >>> linear.weight Parameter containing: tensor([[ 0.1913, -0.3420], [-0.5113, -0.2325]], dtype=torch.float64) >>> gpu1 = torch.device("cuda:1") >>> linear.to(gpu1, dtype=torch.half, non_blocking=True) Linear(in_features=2, out_features=2, bias=True) >>> linear.weight Parameter containing: tensor([[ 0.1914, -0.3420], [-0.5112, -0.2324]], dtype=torch.float16, device='cuda:1') >>> cpu = torch.device("cpu") >>> linear.to(cpu) Linear(in_features=2, out_features=2, bias=True) >>> linear.weight Parameter containing: tensor([[ 0.1914, -0.3420], [-0.5112, 
-0.2324]], dtype=torch.float16) ```

train

``` python train(mode=True) ``` Sets the module in training mode. This has any effect only on certain modules. See documentations of particular modules for details of their behaviors in training/evaluation mode, if they are affected, e.g. :class:`Dropout`, :class:`BatchNorm`, etc. #### Args: mode (bool): whether to set training mode (``True``) or evaluation mode (``False``). Default: ``True``. #### Returns: * `Module`: self

type

``` python type(dst_type) ``` Casts all parameters and buffers to :attr:`dst_type`. #### Arguments: dst_type (type or string): the desired type #### Returns: * `Module`: self

zero_grad

``` python zero_grad() ``` Sets gradients of all model parameters to zero. ================================================ FILE: docs/pytorch/haste_pytorch/LSTM.md ================================================
# haste_pytorch.LSTM ## Class `LSTM` Long Short-Term Memory layer. This LSTM layer offers a fused, GPU-accelerated PyTorch op for inference and training. Although this implementation is comparable in performance to cuDNN's LSTM, it offers additional options not typically found in other high-performance implementations. DropConnect and Zoneout regularization are built-in, and this layer allows setting a non-zero initial forget gate bias. See [\_\_init\_\_](#__init__) and [forward](#forward) for general usage. See [from_native_weights](#from_native_weights) and [to_native_weights](#to_native_weights) for compatibility with PyTorch LSTMs.

__init__

``` python __init__( input_size, hidden_size, batch_first=False, forget_bias=1.0, dropout=0.0, zoneout=0.0 ) ``` Initialize the parameters of the LSTM layer. #### Arguments: * `input_size`: int, the feature dimension of the input. * `hidden_size`: int, the feature dimension of the output. * `batch_first`: (optional) bool, if `True`, then the input and output tensors are provided as `(batch, seq, feature)`. * `forget_bias`: (optional) float, sets the initial bias of the forget gate for this LSTM cell. * `dropout`: (optional) float, sets the dropout rate for DropConnect regularization on the recurrent matrix. * `zoneout`: (optional) float, sets the zoneout rate for Zoneout regularization. #### Variables: * `kernel`: the input projection weight matrix. Dimensions (input_size, hidden_size * 4) with `i,g,f,o` gate layout. Initialized with Xavier uniform initialization. * `recurrent_kernel`: the recurrent projection weight matrix. Dimensions (hidden_size, hidden_size * 4) with `i,g,f,o` gate layout. Initialized with orthogonal initialization. * `bias`: the projection bias vector. Dimensions (hidden_size * 4) with `i,g,f,o` gate layout. The forget gate biases are initialized to `forget_bias` and the rest are zeros. ## Methods

__call__

``` python __call__( *input, **kwargs ) ``` Call self as a function.

add_module

``` python add_module( name, module ) ``` Adds a child module to the current module. The module can be accessed as an attribute using the given name. #### Args: name (string): name of the child module. The child module can be accessed from this module using the given name module (Module): child module to be added to the module.

apply

``` python apply(fn) ``` Applies ``fn`` recursively to every submodule (as returned by ``.children()``) as well as self. Typical use includes initializing the parameters of a model (see also :ref:`nn-init-doc`). #### Args: fn (:class:`Module` -> None): function to be applied to each submodule #### Returns: * `Module`: self Example:: ``` >>> def init_weights(m): >>> print(m) >>> if type(m) == nn.Linear: >>> m.weight.data.fill_(1.0) >>> print(m.weight) >>> net = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2)) >>> net.apply(init_weights) Linear(in_features=2, out_features=2, bias=True) Parameter containing: tensor([[ 1., 1.], [ 1., 1.]]) Linear(in_features=2, out_features=2, bias=True) Parameter containing: tensor([[ 1., 1.], [ 1., 1.]]) Sequential( (0): Linear(in_features=2, out_features=2, bias=True) (1): Linear(in_features=2, out_features=2, bias=True) ) Sequential( (0): Linear(in_features=2, out_features=2, bias=True) (1): Linear(in_features=2, out_features=2, bias=True) ) ```

buffers

``` python buffers(recurse=True) ``` Returns an iterator over module buffers. #### Args: recurse (bool): if True, then yields buffers of this module and all submodules. Otherwise, yields only buffers that are direct members of this module. #### Yields: * `torch.Tensor`: module buffer Example:: ``` >>> for buf in model.buffers(): >>> print(type(buf.data), buf.size()) (20L,) (20L, 1L, 5L, 5L) ```

children

``` python children() ``` Returns an iterator over immediate children modules. #### Yields: * `Module`: a child module

cpu

``` python cpu() ``` Moves all model parameters and buffers to the CPU. #### Returns: * `Module`: self

cuda

``` python cuda(device=None) ``` Moves all model parameters and buffers to the GPU. This also makes associated parameters and buffers different objects. So it should be called before constructing optimizer if the module will live on GPU while being optimized. #### Arguments: device (int, optional): if specified, all parameters will be copied to that device #### Returns: * `Module`: self

double

``` python double() ``` Casts all floating point parameters and buffers to ``double`` datatype. #### Returns: * `Module`: self

eval

``` python eval() ``` Sets the module in evaluation mode. This has any effect only on certain modules. See documentations of particular modules for details of their behaviors in training/evaluation mode, if they are affected, e.g. :class:`Dropout`, :class:`BatchNorm`, etc. This is equivalent to :meth:`self.train(False)`. #### Returns: * `Module`: self

extra_repr

``` python extra_repr() ``` Set the extra representation of the module To print customized extra information, you should reimplement this method in your own modules. Both single-line and multi-line strings are acceptable.

float

``` python float() ``` Casts all floating point parameters and buffers to float datatype. #### Returns: * `Module`: self

forward

``` python forward( input, state=None, lengths=None ) ``` Runs a forward pass of the LSTM layer. #### Arguments: * `input`: Tensor, a batch of input sequences to pass through the LSTM. Dimensions (seq_len, batch_size, input_size) if `batch_first` is `False`, otherwise (batch_size, seq_len, input_size). * `state`: (optional) tuple of Tensors, the initial hidden and cell states `(h_0, c_0)` for each batch element in `input`. Each tensor has dimensions (1, batch_size, hidden_size). Defaults to zeros. * `lengths`: (optional) Tensor, list of sequence lengths for each batch element. Dimension (batch_size). This argument may be omitted if all batch elements are unpadded and have the same sequence length. #### Returns: * `output`: Tensor, the output of the LSTM layer. Dimensions (seq_len, batch_size, hidden_size) if `batch_first` is `False` (default) or (batch_size, seq_len, hidden_size) if `batch_first` is `True`. Note that if `lengths` was specified, the `output` tensor will not be masked. It's the caller's responsibility to either not use the invalid entries or to mask them out before using them. * `(h_n, c_n)`: the hidden and cell states, respectively, for the last sequence item. Dimensions (1, batch_size, hidden_size).

from_native_weights

``` python from_native_weights( weight_ih_l0, weight_hh_l0, bias_ih_l0, bias_hh_l0 ) ``` Copies and converts the provided PyTorch LSTM weights into this layer. #### Arguments: * `weight_ih_l0`: Parameter, the input-hidden weights of the PyTorch LSTM layer. * `weight_hh_l0`: Parameter, the hidden-hidden weights of the PyTorch LSTM layer. * `bias_ih_l0`: Parameter, the input-hidden bias of the PyTorch LSTM layer. * `bias_hh_l0`: Parameter, the hidden-hidden bias of the PyTorch LSTM layer.

half

``` python half() ``` Casts all floating point parameters and buffers to ``half`` datatype. #### Returns: * `Module`: self

load_state_dict

``` python load_state_dict( state_dict, strict=True ) ``` Copies parameters and buffers from :attr:`state_dict` into this module and its descendants. If :attr:`strict` is ``True``, then the keys of :attr:`state_dict` must exactly match the keys returned by this module's :meth:`~torch.nn.Module.state_dict` function. #### Arguments: state_dict (dict): a dict containing parameters and persistent buffers. strict (bool, optional): whether to strictly enforce that the keys in :attr:`state_dict` match the keys returned by this module's :meth:`~torch.nn.Module.state_dict` function. Default: ``True`` #### Returns: ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields: * **missing_keys** is a list of str containing the missing keys * **unexpected_keys** is a list of str containing the unexpected keys

modules

``` python modules() ``` Returns an iterator over all modules in the network. #### Yields: * `Module`: a module in the network #### Note: Duplicate modules are returned only once. In the following example, ``l`` will be returned only once. Example:: ``` >>> l = nn.Linear(2, 2) >>> net = nn.Sequential(l, l) >>> for idx, m in enumerate(net.modules()): print(idx, '->', m) ``` 0 -> Sequential( (0): Linear(in_features=2, out_features=2, bias=True) (1): Linear(in_features=2, out_features=2, bias=True) ) 1 -> Linear(in_features=2, out_features=2, bias=True)

named_buffers

``` python named_buffers( prefix='', recurse=True ) ``` Returns an iterator over module buffers, yielding both the name of the buffer as well as the buffer itself. #### Args: prefix (str): prefix to prepend to all buffer names. recurse (bool): if True, then yields buffers of this module and all submodules. Otherwise, yields only buffers that are direct members of this module. #### Yields: * `(string, torch.Tensor)`: Tuple containing the name and buffer Example:: ``` >>> for name, buf in self.named_buffers(): >>> if name in ['running_var']: >>> print(buf.size()) ```

named_children

``` python named_children() ``` Returns an iterator over immediate children modules, yielding both the name of the module as well as the module itself. #### Yields: * `(string, Module)`: Tuple containing a name and child module Example:: ``` >>> for name, module in model.named_children(): >>> if name in ['conv4', 'conv5']: >>> print(module) ```

named_modules

``` python named_modules( memo=None, prefix='' ) ``` Returns an iterator over all modules in the network, yielding both the name of the module as well as the module itself. #### Yields: * `(string, Module)`: Tuple of name and module #### Note: Duplicate modules are returned only once. In the following example, ``l`` will be returned only once. Example:: ``` >>> l = nn.Linear(2, 2) >>> net = nn.Sequential(l, l) >>> for idx, m in enumerate(net.named_modules()): print(idx, '->', m) ``` 0 -> ('', Sequential( (0): Linear(in_features=2, out_features=2, bias=True) (1): Linear(in_features=2, out_features=2, bias=True) )) 1 -> ('0', Linear(in_features=2, out_features=2, bias=True))

named_parameters

``` python named_parameters( prefix='', recurse=True ) ``` Returns an iterator over module parameters, yielding both the name of the parameter as well as the parameter itself. #### Args: prefix (str): prefix to prepend to all parameter names. recurse (bool): if True, then yields parameters of this module and all submodules. Otherwise, yields only parameters that are direct members of this module. #### Yields: * `(string, Parameter)`: Tuple containing the name and parameter Example:: ``` >>> for name, param in self.named_parameters(): >>> if name in ['bias']: >>> print(param.size()) ```

parameters

``` python parameters(recurse=True) ``` Returns an iterator over module parameters. This is typically passed to an optimizer. #### Args: recurse (bool): if True, then yields parameters of this module and all submodules. Otherwise, yields only parameters that are direct members of this module. #### Yields: * `Parameter`: module parameter Example:: ``` >>> for param in model.parameters(): >>> print(type(param.data), param.size()) (20L,) (20L, 1L, 5L, 5L) ```

register_backward_hook

``` python register_backward_hook(hook) ``` Registers a backward hook on the module. The hook will be called every time the gradients with respect to module inputs are computed. The hook should have the following signature:: hook(module, grad_input, grad_output) -> Tensor or None The :attr:`grad_input` and :attr:`grad_output` may be tuples if the module has multiple inputs or outputs. The hook should not modify its arguments, but it can optionally return a new gradient with respect to input that will be used in place of :attr:`grad_input` in subsequent computations. #### Returns: :class:`torch.utils.hooks.RemovableHandle`: a handle that can be used to remove the added hook by calling ``handle.remove()`` .. warning :: The current implementation will not have the presented behavior for complex :class:`Module` that perform many operations. In some failure cases, :attr:`grad_input` and :attr:`grad_output` will only contain the gradients for a subset of the inputs and outputs. For such :class:`Module`, you should use :func:`torch.Tensor.register_hook` directly on a specific input or output to get the required gradients.

register_buffer

``` python register_buffer( name, tensor ) ``` Adds a persistent buffer to the module. This is typically used to register a buffer that should not to be considered a model parameter. For example, BatchNorm's ``running_mean`` is not a parameter, but is part of the persistent state. Buffers can be accessed as attributes using given names. #### Args: name (string): name of the buffer. The buffer can be accessed from this module using the given name tensor (Tensor): buffer to be registered. Example:: ``` >>> self.register_buffer('running_mean', torch.zeros(num_features)) ```

register_forward_hook

``` python register_forward_hook(hook) ``` Registers a forward hook on the module. The hook will be called every time after :func:`forward` has computed an output. It should have the following signature:: hook(module, input, output) -> None or modified output The hook can modify the output. It can modify the input inplace but it will not have effect on forward since this is called after :func:`forward` is called. #### Returns: :class:`torch.utils.hooks.RemovableHandle`: a handle that can be used to remove the added hook by calling ``handle.remove()``

register_forward_pre_hook

``` python register_forward_pre_hook(hook) ``` Registers a forward pre-hook on the module. The hook will be called every time before :func:`forward` is invoked. It should have the following signature:: hook(module, input) -> None or modified input The hook can modify the input. User can either return a tuple or a single modified value in the hook. We will wrap the value into a tuple if a single value is returned(unless that value is already a tuple). #### Returns: :class:`torch.utils.hooks.RemovableHandle`: a handle that can be used to remove the added hook by calling ``handle.remove()``

register_parameter

``` python register_parameter( name, param ) ``` Adds a parameter to the module. The parameter can be accessed as an attribute using given name. #### Args: name (string): name of the parameter. The parameter can be accessed from this module using the given name param (Parameter): parameter to be added to the module.

requires_grad_

``` python requires_grad_(requires_grad=True) ``` Change if autograd should record operations on parameters in this module. This method sets the parameters' :attr:`requires_grad` attributes in-place. This method is helpful for freezing part of the module for finetuning or training parts of a model individually (e.g., GAN training). #### Args: requires_grad (bool): whether autograd should record operations on parameters in this module. Default: ``True``. #### Returns: * `Module`: self

reset_parameters

``` python reset_parameters() ``` Resets this layer's parameters to their initial values.

share_memory

``` python share_memory() ```

state_dict

``` python state_dict( destination=None, prefix='', keep_vars=False ) ``` Returns a dictionary containing a whole state of the module. Both parameters and persistent buffers (e.g. running averages) are included. Keys are corresponding parameter and buffer names. #### Returns: * `dict`: a dictionary containing a whole state of the module Example:: ``` >>> module.state_dict().keys() ['bias', 'weight'] ```

to

``` python to( *args, **kwargs ) ``` Moves and/or casts the parameters and buffers. This can be called as .. function:: to(device=None, dtype=None, non_blocking=False) .. function:: to(dtype, non_blocking=False) .. function:: to(tensor, non_blocking=False) Its signature is similar to :meth:`torch.Tensor.to`, but only accepts floating point desired :attr:`dtype` s. In addition, this method will only cast the floating point parameters and buffers to :attr:`dtype` (if given). The integral parameters and buffers will be moved :attr:`device`, if that is given, but with dtypes unchanged. When :attr:`non_blocking` is set, it tries to convert/move asynchronously with respect to the host if possible, e.g., moving CPU Tensors with pinned memory to CUDA devices. See below for examples. .. note:: This method modifies the module in-place. #### Args: device (:class:`torch.device`): the desired device of the parameters and buffers in this module dtype (:class:`torch.dtype`): the desired floating point type of the floating point parameters and buffers in this module tensor (torch.Tensor): Tensor whose dtype and device are the desired dtype and device for all parameters and buffers in this module #### Returns: * `Module`: self Example:: ``` >>> linear = nn.Linear(2, 2) >>> linear.weight Parameter containing: tensor([[ 0.1913, -0.3420], [-0.5113, -0.2325]]) >>> linear.to(torch.double) Linear(in_features=2, out_features=2, bias=True) >>> linear.weight Parameter containing: tensor([[ 0.1913, -0.3420], [-0.5113, -0.2325]], dtype=torch.float64) >>> gpu1 = torch.device("cuda:1") >>> linear.to(gpu1, dtype=torch.half, non_blocking=True) Linear(in_features=2, out_features=2, bias=True) >>> linear.weight Parameter containing: tensor([[ 0.1914, -0.3420], [-0.5112, -0.2324]], dtype=torch.float16, device='cuda:1') >>> cpu = torch.device("cpu") >>> linear.to(cpu) Linear(in_features=2, out_features=2, bias=True) >>> linear.weight Parameter containing: tensor([[ 0.1914, -0.3420], [-0.5112, 
-0.2324]], dtype=torch.float16) ```

to_native_weights

``` python to_native_weights() ``` Converts Haste LSTM weights to native PyTorch LSTM weights. #### Returns: * `weight_ih_l0`: Parameter, the input-hidden weights of the LSTM layer. * `weight_hh_l0`: Parameter, the hidden-hidden weights of the LSTM layer. * `bias_ih_l0`: Parameter, the input-hidden bias of the LSTM layer. * `bias_hh_l0`: Parameter, the hidden-hidden bias of the LSTM layer.

train

``` python train(mode=True) ``` Sets the module in training mode. This has any effect only on certain modules. See documentations of particular modules for details of their behaviors in training/evaluation mode, if they are affected, e.g. :class:`Dropout`, :class:`BatchNorm`, etc. #### Args: mode (bool): whether to set training mode (``True``) or evaluation mode (``False``). Default: ``True``. #### Returns: * `Module`: self

type

``` python type(dst_type) ``` Casts all parameters and buffers to :attr:`dst_type`. #### Arguments: dst_type (type or string): the desired type #### Returns: * `Module`: self

zero_grad

``` python zero_grad() ``` Sets gradients of all model parameters to zero. ================================================ FILE: docs/pytorch/haste_pytorch/LayerNormGRU.md ================================================
# haste_pytorch.LayerNormGRU ## Class `LayerNormGRU` Layer Normalized Gated Recurrent Unit layer. This GRU layer applies layer normalization to the input and recurrent output activations of a standard GRU. The implementation is fused and GPU-accelerated. There are two commonly-used variants of GRU cells. This one implements 1406.1078v1 which applies the reset gate to the hidden state after matrix multiplication. The other variant, 1406.1078v3, applies the reset gate before matrix multiplication and is currently unsupported. This layer has built-in support for DropConnect and Zoneout, which are both techniques used to regularize RNNs. See [\_\_init\_\_](#__init__) and [forward](#forward) for usage.

__init__

``` python __init__( input_size, hidden_size, batch_first=False, dropout=0.0, zoneout=0.0 ) ``` Initialize the parameters of the GRU layer. #### Arguments: * `input_size`: int, the feature dimension of the input. * `hidden_size`: int, the feature dimension of the output. * `batch_first`: (optional) bool, if `True`, then the input and output tensors are provided as `(batch, seq, feature)`. * `dropout`: (optional) float, sets the dropout rate for DropConnect regularization on the recurrent matrix. * `zoneout`: (optional) float, sets the zoneout rate for Zoneout regularization. #### Variables: * `kernel`: the input projection weight matrix. Dimensions (input_size, hidden_size * 3) with `z,r,h` gate layout. Initialized with Xavier uniform initialization. * `recurrent_kernel`: the recurrent projection weight matrix. Dimensions (hidden_size, hidden_size * 3) with `z,r,h` gate layout. Initialized with orthogonal initialization. * `bias`: the input projection bias vector. Dimensions (hidden_size * 3) with `z,r,h` gate layout. Initialized to zeros. * `recurrent_bias`: the recurrent projection bias vector. Dimensions (hidden_size * 3) with `z,r,h` gate layout. Initialized to zeros. * `gamma`: the input and recurrent normalization gain. Dimensions (2, hidden_size * 3) with `gamma[0]` specifying the input gain and `gamma[1]` specifying the recurrent gain. Initialized to ones. ## Methods

__call__

``` python __call__( *input, **kwargs ) ``` Call self as a function.

add_module

``` python add_module( name, module ) ``` Adds a child module to the current module. The module can be accessed as an attribute using the given name. #### Args: name (string): name of the child module. The child module can be accessed from this module using the given name module (Module): child module to be added to the module.

apply

``` python apply(fn) ``` Applies ``fn`` recursively to every submodule (as returned by ``.children()``) as well as self. Typical use includes initializing the parameters of a model (see also :ref:`nn-init-doc`). #### Args: fn (:class:`Module` -> None): function to be applied to each submodule #### Returns: * `Module`: self Example:: ``` >>> def init_weights(m): >>> print(m) >>> if type(m) == nn.Linear: >>> m.weight.data.fill_(1.0) >>> print(m.weight) >>> net = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2)) >>> net.apply(init_weights) Linear(in_features=2, out_features=2, bias=True) Parameter containing: tensor([[ 1., 1.], [ 1., 1.]]) Linear(in_features=2, out_features=2, bias=True) Parameter containing: tensor([[ 1., 1.], [ 1., 1.]]) Sequential( (0): Linear(in_features=2, out_features=2, bias=True) (1): Linear(in_features=2, out_features=2, bias=True) ) Sequential( (0): Linear(in_features=2, out_features=2, bias=True) (1): Linear(in_features=2, out_features=2, bias=True) ) ```

buffers

``` python buffers(recurse=True) ``` Returns an iterator over module buffers. #### Args: recurse (bool): if True, then yields buffers of this module and all submodules. Otherwise, yields only buffers that are direct members of this module. #### Yields: * `torch.Tensor`: module buffer Example:: ``` >>> for buf in model.buffers(): >>> print(type(buf.data), buf.size()) (20L,) (20L, 1L, 5L, 5L) ```

children

``` python children() ``` Returns an iterator over immediate children modules. #### Yields: * `Module`: a child module

cpu

``` python cpu() ``` Moves all model parameters and buffers to the CPU. #### Returns: * `Module`: self

cuda

``` python cuda(device=None) ``` Moves all model parameters and buffers to the GPU. This also makes associated parameters and buffers different objects. So it should be called before constructing optimizer if the module will live on GPU while being optimized. #### Arguments: device (int, optional): if specified, all parameters will be copied to that device #### Returns: * `Module`: self

double

``` python double() ``` Casts all floating point parameters and buffers to ``double`` datatype. #### Returns: * `Module`: self

eval

``` python eval() ``` Sets the module in evaluation mode. This has any effect only on certain modules. See documentations of particular modules for details of their behaviors in training/evaluation mode, if they are affected, e.g. :class:`Dropout`, :class:`BatchNorm`, etc. This is equivalent with :meth:`self.train(False) `. #### Returns: * `Module`: self

extra_repr

``` python extra_repr() ``` Set the extra representation of the module To print customized extra information, you should reimplement this method in your own modules. Both single-line and multi-line strings are acceptable.

float

``` python float() ``` Casts all floating point parameters and buffers to float datatype. #### Returns: * `Module`: self

forward

``` python forward( input, state=None, lengths=None ) ``` Runs a forward pass of the GRU layer. #### Arguments: * `input`: Tensor, a batch of input sequences to pass through the GRU. Dimensions (seq_len, batch_size, input_size) if `batch_first` is `False`, otherwise (batch_size, seq_len, input_size). * `state`: (optional) Tensor, the initial state for each batch element in `input`. Dimensions (1, batch_size, hidden_size). Defaults to zeros. * `lengths`: (optional) Tensor, list of sequence lengths for each batch element. Dimension (batch_size). This argument may be omitted if all batch elements are unpadded and have the same sequence length. #### Returns: * `output`: Tensor, the output of the GRU layer. Dimensions (seq_len, batch_size, hidden_size) if `batch_first` is `False` (default) or (batch_size, seq_len, hidden_size) if `batch_first` is `True`. Note that if `lengths` was specified, the `output` tensor will not be masked. It's the caller's responsibility to either not use the invalid entries or to mask them out before using them. * `h_n`: the hidden state for the last sequence item. Dimensions (1, batch_size, hidden_size).

half

``` python half() ``` Casts all floating point parameters and buffers to ``half`` datatype. #### Returns: * `Module`: self

load_state_dict

``` python load_state_dict( state_dict, strict=True ) ``` Copies parameters and buffers from :attr:`state_dict` into this module and its descendants. If :attr:`strict` is ``True``, then the keys of :attr:`state_dict` must exactly match the keys returned by this module's :meth:`~torch.nn.Module.state_dict` function. #### Arguments: state_dict (dict): a dict containing parameters and persistent buffers. strict (bool, optional): whether to strictly enforce that the keys in :attr:`state_dict` match the keys returned by this module's :meth:`~torch.nn.Module.state_dict` function. Default: ``True`` #### Returns: ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields: * **missing_keys** is a list of str containing the missing keys * **unexpected_keys** is a list of str containing the unexpected keys

modules

``` python modules() ``` Returns an iterator over all modules in the network. #### Yields: * `Module`: a module in the network #### Note: Duplicate modules are returned only once. In the following example, ``l`` will be returned only once. Example:: ``` >>> l = nn.Linear(2, 2) >>> net = nn.Sequential(l, l) >>> for idx, m in enumerate(net.modules()): print(idx, '->', m) ``` 0 -> Sequential( (0): Linear(in_features=2, out_features=2, bias=True) (1): Linear(in_features=2, out_features=2, bias=True) ) 1 -> Linear(in_features=2, out_features=2, bias=True)

named_buffers

``` python named_buffers( prefix='', recurse=True ) ``` Returns an iterator over module buffers, yielding both the name of the buffer as well as the buffer itself. #### Args: prefix (str): prefix to prepend to all buffer names. recurse (bool): if True, then yields buffers of this module and all submodules. Otherwise, yields only buffers that are direct members of this module. #### Yields: * `(string, torch.Tensor)`: Tuple containing the name and buffer Example:: ``` >>> for name, buf in self.named_buffers(): >>> if name in ['running_var']: >>> print(buf.size()) ```

named_children

``` python named_children() ``` Returns an iterator over immediate children modules, yielding both the name of the module as well as the module itself. #### Yields: * `(string, Module)`: Tuple containing a name and child module Example:: ``` >>> for name, module in model.named_children(): >>> if name in ['conv4', 'conv5']: >>> print(module) ```

named_modules

``` python named_modules( memo=None, prefix='' ) ``` Returns an iterator over all modules in the network, yielding both the name of the module as well as the module itself. #### Yields: * `(string, Module)`: Tuple of name and module #### Note: Duplicate modules are returned only once. In the following example, ``l`` will be returned only once. Example:: ``` >>> l = nn.Linear(2, 2) >>> net = nn.Sequential(l, l) >>> for idx, m in enumerate(net.named_modules()): print(idx, '->', m) ``` 0 -> ('', Sequential( (0): Linear(in_features=2, out_features=2, bias=True) (1): Linear(in_features=2, out_features=2, bias=True) )) 1 -> ('0', Linear(in_features=2, out_features=2, bias=True))

named_parameters

``` python named_parameters( prefix='', recurse=True ) ``` Returns an iterator over module parameters, yielding both the name of the parameter as well as the parameter itself. #### Args: prefix (str): prefix to prepend to all parameter names. recurse (bool): if True, then yields parameters of this module and all submodules. Otherwise, yields only parameters that are direct members of this module. #### Yields: * `(string, Parameter)`: Tuple containing the name and parameter Example:: ``` >>> for name, param in self.named_parameters(): >>> if name in ['bias']: >>> print(param.size()) ```

parameters

``` python parameters(recurse=True) ``` Returns an iterator over module parameters. This is typically passed to an optimizer. #### Args: recurse (bool): if True, then yields parameters of this module and all submodules. Otherwise, yields only parameters that are direct members of this module. #### Yields: * `Parameter`: module parameter Example:: ``` >>> for param in model.parameters(): >>> print(type(param.data), param.size()) (20L,) (20L, 1L, 5L, 5L) ```

register_backward_hook

``` python register_backward_hook(hook) ``` Registers a backward hook on the module. The hook will be called every time the gradients with respect to module inputs are computed. The hook should have the following signature:: hook(module, grad_input, grad_output) -> Tensor or None The :attr:`grad_input` and :attr:`grad_output` may be tuples if the module has multiple inputs or outputs. The hook should not modify its arguments, but it can optionally return a new gradient with respect to input that will be used in place of :attr:`grad_input` in subsequent computations. #### Returns: :class:`torch.utils.hooks.RemovableHandle`: a handle that can be used to remove the added hook by calling ``handle.remove()`` .. warning :: The current implementation will not have the presented behavior for complex :class:`Module` that perform many operations. In some failure cases, :attr:`grad_input` and :attr:`grad_output` will only contain the gradients for a subset of the inputs and outputs. For such :class:`Module`, you should use :func:`torch.Tensor.register_hook` directly on a specific input or output to get the required gradients.

register_buffer

``` python register_buffer( name, tensor ) ``` Adds a persistent buffer to the module. This is typically used to register a buffer that should not to be considered a model parameter. For example, BatchNorm's ``running_mean`` is not a parameter, but is part of the persistent state. Buffers can be accessed as attributes using given names. #### Args: name (string): name of the buffer. The buffer can be accessed from this module using the given name tensor (Tensor): buffer to be registered. Example:: ``` >>> self.register_buffer('running_mean', torch.zeros(num_features)) ```

register_forward_hook

``` python register_forward_hook(hook) ``` Registers a forward hook on the module. The hook will be called every time after :func:`forward` has computed an output. It should have the following signature:: hook(module, input, output) -> None or modified output The hook can modify the output. It can modify the input inplace but it will not have effect on forward since this is called after :func:`forward` is called. #### Returns: :class:`torch.utils.hooks.RemovableHandle`: a handle that can be used to remove the added hook by calling ``handle.remove()``

register_forward_pre_hook

``` python register_forward_pre_hook(hook) ``` Registers a forward pre-hook on the module. The hook will be called every time before :func:`forward` is invoked. It should have the following signature:: hook(module, input) -> None or modified input The hook can modify the input. User can either return a tuple or a single modified value in the hook. We will wrap the value into a tuple if a single value is returned(unless that value is already a tuple). #### Returns: :class:`torch.utils.hooks.RemovableHandle`: a handle that can be used to remove the added hook by calling ``handle.remove()``

register_parameter

``` python register_parameter( name, param ) ``` Adds a parameter to the module. The parameter can be accessed as an attribute using given name. #### Args: name (string): name of the parameter. The parameter can be accessed from this module using the given name param (Parameter): parameter to be added to the module.

requires_grad_

``` python requires_grad_(requires_grad=True) ``` Change if autograd should record operations on parameters in this module. This method sets the parameters' :attr:`requires_grad` attributes in-place. This method is helpful for freezing part of the module for finetuning or training parts of a model individually (e.g., GAN training). #### Args: requires_grad (bool): whether autograd should record operations on parameters in this module. Default: ``True``. #### Returns: * `Module`: self

reset_parameters

``` python reset_parameters() ``` Resets this layer's parameters to their initial values.

share_memory

``` python share_memory() ```

state_dict

``` python state_dict( destination=None, prefix='', keep_vars=False ) ``` Returns a dictionary containing a whole state of the module. Both parameters and persistent buffers (e.g. running averages) are included. Keys are corresponding parameter and buffer names. #### Returns: * `dict`: a dictionary containing a whole state of the module Example:: ``` >>> module.state_dict().keys() ['bias', 'weight'] ```

to

``` python to( *args, **kwargs ) ``` Moves and/or casts the parameters and buffers. This can be called as .. function:: to(device=None, dtype=None, non_blocking=False) .. function:: to(dtype, non_blocking=False) .. function:: to(tensor, non_blocking=False) Its signature is similar to :meth:`torch.Tensor.to`, but only accepts floating point desired :attr:`dtype` s. In addition, this method will only cast the floating point parameters and buffers to :attr:`dtype` (if given). The integral parameters and buffers will be moved :attr:`device`, if that is given, but with dtypes unchanged. When :attr:`non_blocking` is set, it tries to convert/move asynchronously with respect to the host if possible, e.g., moving CPU Tensors with pinned memory to CUDA devices. See below for examples. .. note:: This method modifies the module in-place. #### Args: device (:class:`torch.device`): the desired device of the parameters and buffers in this module dtype (:class:`torch.dtype`): the desired floating point type of the floating point parameters and buffers in this module tensor (torch.Tensor): Tensor whose dtype and device are the desired dtype and device for all parameters and buffers in this module #### Returns: * `Module`: self Example:: ``` >>> linear = nn.Linear(2, 2) >>> linear.weight Parameter containing: tensor([[ 0.1913, -0.3420], [-0.5113, -0.2325]]) >>> linear.to(torch.double) Linear(in_features=2, out_features=2, bias=True) >>> linear.weight Parameter containing: tensor([[ 0.1913, -0.3420], [-0.5113, -0.2325]], dtype=torch.float64) >>> gpu1 = torch.device("cuda:1") >>> linear.to(gpu1, dtype=torch.half, non_blocking=True) Linear(in_features=2, out_features=2, bias=True) >>> linear.weight Parameter containing: tensor([[ 0.1914, -0.3420], [-0.5112, -0.2324]], dtype=torch.float16, device='cuda:1') >>> cpu = torch.device("cpu") >>> linear.to(cpu) Linear(in_features=2, out_features=2, bias=True) >>> linear.weight Parameter containing: tensor([[ 0.1914, -0.3420], [-0.5112, 
-0.2324]], dtype=torch.float16) ```

train

``` python train(mode=True) ``` Sets the module in training mode. This has any effect only on certain modules. See documentations of particular modules for details of their behaviors in training/evaluation mode, if they are affected, e.g. :class:`Dropout`, :class:`BatchNorm`, etc. #### Args: mode (bool): whether to set training mode (``True``) or evaluation mode (``False``). Default: ``True``. #### Returns: * `Module`: self

type

``` python type(dst_type) ``` Casts all parameters and buffers to :attr:`dst_type`. #### Arguments: dst_type (type or string): the desired type #### Returns: * `Module`: self

zero_grad

``` python zero_grad() ``` Sets gradients of all model parameters to zero. ================================================ FILE: docs/pytorch/haste_pytorch/LayerNormLSTM.md ================================================
# haste_pytorch.LayerNormLSTM ## Class `LayerNormLSTM` Layer Normalized Long Short-Term Memory layer. This LSTM layer applies layer normalization to the input, recurrent, and output activations of a standard LSTM. The implementation is fused and GPU-accelerated. DropConnect and Zoneout regularization are built-in, and this layer allows setting a non-zero initial forget gate bias. Details about the exact function this layer implements can be found at https://github.com/lmnt-com/haste/issues/1. See [\_\_init\_\_](#__init__) and [forward](#forward) for usage.

__init__

``` python __init__( input_size, hidden_size, batch_first=False, forget_bias=1.0, dropout=0.0, zoneout=0.0 ) ``` Initialize the parameters of the LSTM layer. #### Arguments: * `input_size`: int, the feature dimension of the input. * `hidden_size`: int, the feature dimension of the output. * `batch_first`: (optional) bool, if `True`, then the input and output tensors are provided as `(batch, seq, feature)`. * `forget_bias`: (optional) float, sets the initial bias of the forget gate for this LSTM cell. * `dropout`: (optional) float, sets the dropout rate for DropConnect regularization on the recurrent matrix. * `zoneout`: (optional) float, sets the zoneout rate for Zoneout regularization. #### Variables: * `kernel`: the input projection weight matrix. Dimensions (input_size, hidden_size * 4) with `i,g,f,o` gate layout. Initialized with Xavier uniform initialization. * `recurrent_kernel`: the recurrent projection weight matrix. Dimensions (hidden_size, hidden_size * 4) with `i,g,f,o` gate layout. Initialized with orthogonal initialization. * `bias`: the projection bias vector. Dimensions (hidden_size * 4) with `i,g,f,o` gate layout. The forget gate biases are initialized to `forget_bias` and the rest are zeros. * `gamma`: the input and recurrent normalization gain. Dimensions (2, hidden_size * 4) with `gamma[0]` specifying the input gain and `gamma[1]` specifying the recurrent gain. Initialized to ones. * `gamma_h`: the output normalization gain. Dimensions (hidden_size). Initialized to ones. * `beta_h`: the output normalization bias. Dimensions (hidden_size). Initialized to zeros. ## Methods

__call__

``` python __call__( *input, **kwargs ) ``` Call self as a function.

add_module

``` python add_module( name, module ) ``` Adds a child module to the current module. The module can be accessed as an attribute using the given name. #### Args: name (string): name of the child module. The child module can be accessed from this module using the given name module (Module): child module to be added to the module.

apply

``` python apply(fn) ``` Applies ``fn`` recursively to every submodule (as returned by ``.children()``) as well as self. Typical use includes initializing the parameters of a model (see also :ref:`nn-init-doc`). #### Args: fn (:class:`Module` -> None): function to be applied to each submodule #### Returns: * `Module`: self Example:: ``` >>> def init_weights(m): >>> print(m) >>> if type(m) == nn.Linear: >>> m.weight.data.fill_(1.0) >>> print(m.weight) >>> net = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2)) >>> net.apply(init_weights) Linear(in_features=2, out_features=2, bias=True) Parameter containing: tensor([[ 1., 1.], [ 1., 1.]]) Linear(in_features=2, out_features=2, bias=True) Parameter containing: tensor([[ 1., 1.], [ 1., 1.]]) Sequential( (0): Linear(in_features=2, out_features=2, bias=True) (1): Linear(in_features=2, out_features=2, bias=True) ) Sequential( (0): Linear(in_features=2, out_features=2, bias=True) (1): Linear(in_features=2, out_features=2, bias=True) ) ```

buffers

``` python buffers(recurse=True) ``` Returns an iterator over module buffers. #### Args: recurse (bool): if True, then yields buffers of this module and all submodules. Otherwise, yields only buffers that are direct members of this module. #### Yields: * `torch.Tensor`: module buffer Example:: ``` >>> for buf in model.buffers(): >>> print(type(buf.data), buf.size()) (20L,) (20L, 1L, 5L, 5L) ```

children

``` python children() ``` Returns an iterator over immediate children modules. #### Yields: * `Module`: a child module

cpu

``` python cpu() ``` Moves all model parameters and buffers to the CPU. #### Returns: * `Module`: self

cuda

``` python cuda(device=None) ``` Moves all model parameters and buffers to the GPU. This also makes associated parameters and buffers different objects. So it should be called before constructing optimizer if the module will live on GPU while being optimized. #### Arguments: device (int, optional): if specified, all parameters will be copied to that device #### Returns: * `Module`: self

double

``` python double() ``` Casts all floating point parameters and buffers to ``double`` datatype. #### Returns: * `Module`: self

eval

``` python eval() ``` Sets the module in evaluation mode. This has any effect only on certain modules. See documentations of particular modules for details of their behaviors in training/evaluation mode, if they are affected, e.g. :class:`Dropout`, :class:`BatchNorm`, etc. This is equivalent with :meth:`self.train(False) `. #### Returns: * `Module`: self

extra_repr

``` python extra_repr() ``` Set the extra representation of the module To print customized extra information, you should reimplement this method in your own modules. Both single-line and multi-line strings are acceptable.

float

``` python float() ``` Casts all floating point parameters and buffers to float datatype. #### Returns: * `Module`: self

forward

``` python forward( input, state=None, lengths=None ) ``` Runs a forward pass of the LSTM layer. #### Arguments: * `input`: Tensor, a batch of input sequences to pass through the LSTM. Dimensions (seq_len, batch_size, input_size) if `batch_first` is `False`, otherwise (batch_size, seq_len, input_size). * `state`: (optional) tuple of Tensors `(h_0, c_0)`, the initial hidden and cell states for each batch element in `input`. Each has dimensions (1, batch_size, hidden_size). Defaults to zeros. * `lengths`: (optional) Tensor, list of sequence lengths for each batch element. Dimension (batch_size). This argument may be omitted if all batch elements are unpadded and have the same sequence length. #### Returns: * `output`: Tensor, the output of the LSTM layer. Dimensions (seq_len, batch_size, hidden_size) if `batch_first` is `False` (default) or (batch_size, seq_len, hidden_size) if `batch_first` is `True`. Note that if `lengths` was specified, the `output` tensor will not be masked. It's the caller's responsibility to either not use the invalid entries or to mask them out before using them. * `(h_n, c_n)`: the hidden and cell states, respectively, for the last sequence item. Dimensions (1, batch_size, hidden_size).

half

``` python half() ``` Casts all floating point parameters and buffers to ``half`` datatype. #### Returns: * `Module`: self

load_state_dict

``` python load_state_dict( state_dict, strict=True ) ``` Copies parameters and buffers from :attr:`state_dict` into this module and its descendants. If :attr:`strict` is ``True``, then the keys of :attr:`state_dict` must exactly match the keys returned by this module's :meth:`~torch.nn.Module.state_dict` function. #### Arguments: state_dict (dict): a dict containing parameters and persistent buffers. strict (bool, optional): whether to strictly enforce that the keys in :attr:`state_dict` match the keys returned by this module's :meth:`~torch.nn.Module.state_dict` function. Default: ``True`` #### Returns: ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields: * **missing_keys** is a list of str containing the missing keys * **unexpected_keys** is a list of str containing the unexpected keys

modules

``` python modules() ``` Returns an iterator over all modules in the network. #### Yields: * `Module`: a module in the network #### Note: Duplicate modules are returned only once. In the following example, ``l`` will be returned only once. Example:: ``` >>> l = nn.Linear(2, 2) >>> net = nn.Sequential(l, l) >>> for idx, m in enumerate(net.modules()): print(idx, '->', m) ``` 0 -> Sequential( (0): Linear(in_features=2, out_features=2, bias=True) (1): Linear(in_features=2, out_features=2, bias=True) ) 1 -> Linear(in_features=2, out_features=2, bias=True)

named_buffers

``` python named_buffers( prefix='', recurse=True ) ``` Returns an iterator over module buffers, yielding both the name of the buffer as well as the buffer itself. #### Args: prefix (str): prefix to prepend to all buffer names. recurse (bool): if True, then yields buffers of this module and all submodules. Otherwise, yields only buffers that are direct members of this module. #### Yields: * `(string, torch.Tensor)`: Tuple containing the name and buffer Example:: ``` >>> for name, buf in self.named_buffers(): >>> if name in ['running_var']: >>> print(buf.size()) ```

named_children

``` python named_children() ``` Returns an iterator over immediate children modules, yielding both the name of the module as well as the module itself. #### Yields: * `(string, Module)`: Tuple containing a name and child module Example:: ``` >>> for name, module in model.named_children(): >>> if name in ['conv4', 'conv5']: >>> print(module) ```

named_modules

``` python named_modules( memo=None, prefix='' ) ``` Returns an iterator over all modules in the network, yielding both the name of the module as well as the module itself. #### Yields: * `(string, Module)`: Tuple of name and module #### Note: Duplicate modules are returned only once. In the following example, ``l`` will be returned only once. Example:: ``` >>> l = nn.Linear(2, 2) >>> net = nn.Sequential(l, l) >>> for idx, m in enumerate(net.named_modules()): print(idx, '->', m) ``` 0 -> ('', Sequential( (0): Linear(in_features=2, out_features=2, bias=True) (1): Linear(in_features=2, out_features=2, bias=True) )) 1 -> ('0', Linear(in_features=2, out_features=2, bias=True))

named_parameters

``` python named_parameters( prefix='', recurse=True ) ``` Returns an iterator over module parameters, yielding both the name of the parameter as well as the parameter itself. #### Args: prefix (str): prefix to prepend to all parameter names. recurse (bool): if True, then yields parameters of this module and all submodules. Otherwise, yields only parameters that are direct members of this module. #### Yields: * `(string, Parameter)`: Tuple containing the name and parameter Example:: ``` >>> for name, param in self.named_parameters(): >>> if name in ['bias']: >>> print(param.size()) ```

parameters

``` python parameters(recurse=True) ``` Returns an iterator over module parameters. This is typically passed to an optimizer. #### Args: recurse (bool): if True, then yields parameters of this module and all submodules. Otherwise, yields only parameters that are direct members of this module. #### Yields: * `Parameter`: module parameter Example:: ``` >>> for param in model.parameters(): >>> print(type(param.data), param.size()) (20L,) (20L, 1L, 5L, 5L) ```

register_backward_hook

``` python register_backward_hook(hook) ``` Registers a backward hook on the module. The hook will be called every time the gradients with respect to module inputs are computed. The hook should have the following signature:: hook(module, grad_input, grad_output) -> Tensor or None The :attr:`grad_input` and :attr:`grad_output` may be tuples if the module has multiple inputs or outputs. The hook should not modify its arguments, but it can optionally return a new gradient with respect to input that will be used in place of :attr:`grad_input` in subsequent computations. #### Returns: :class:`torch.utils.hooks.RemovableHandle`: a handle that can be used to remove the added hook by calling ``handle.remove()`` .. warning :: The current implementation will not have the presented behavior for complex :class:`Module` that perform many operations. In some failure cases, :attr:`grad_input` and :attr:`grad_output` will only contain the gradients for a subset of the inputs and outputs. For such :class:`Module`, you should use :func:`torch.Tensor.register_hook` directly on a specific input or output to get the required gradients.

register_buffer

``` python register_buffer( name, tensor ) ``` Adds a persistent buffer to the module. This is typically used to register a buffer that should not to be considered a model parameter. For example, BatchNorm's ``running_mean`` is not a parameter, but is part of the persistent state. Buffers can be accessed as attributes using given names. #### Args: name (string): name of the buffer. The buffer can be accessed from this module using the given name tensor (Tensor): buffer to be registered. Example:: ``` >>> self.register_buffer('running_mean', torch.zeros(num_features)) ```

register_forward_hook

``` python register_forward_hook(hook) ``` Registers a forward hook on the module. The hook will be called every time after :func:`forward` has computed an output. It should have the following signature:: hook(module, input, output) -> None or modified output The hook can modify the output. It can modify the input inplace but it will not have effect on forward since this is called after :func:`forward` is called. #### Returns: :class:`torch.utils.hooks.RemovableHandle`: a handle that can be used to remove the added hook by calling ``handle.remove()``

register_forward_pre_hook

``` python register_forward_pre_hook(hook) ``` Registers a forward pre-hook on the module. The hook will be called every time before :func:`forward` is invoked. It should have the following signature:: hook(module, input) -> None or modified input The hook can modify the input. The user can either return a tuple or a single modified value in the hook. We will wrap the value into a tuple if a single value is returned (unless that value is already a tuple). #### Returns: :class:`torch.utils.hooks.RemovableHandle`: a handle that can be used to remove the added hook by calling ``handle.remove()``

register_parameter

``` python register_parameter( name, param ) ``` Adds a parameter to the module. The parameter can be accessed as an attribute using given name. #### Args: name (string): name of the parameter. The parameter can be accessed from this module using the given name param (Parameter): parameter to be added to the module.

requires_grad_

``` python requires_grad_(requires_grad=True) ``` Change if autograd should record operations on parameters in this module. This method sets the parameters' :attr:`requires_grad` attributes in-place. This method is helpful for freezing part of the module for finetuning or training parts of a model individually (e.g., GAN training). #### Args: requires_grad (bool): whether autograd should record operations on parameters in this module. Default: ``True``. #### Returns: * `Module`: self

reset_parameters

``` python reset_parameters() ``` Resets this layer's parameters to their initial values.

share_memory

``` python share_memory() ```

state_dict

``` python state_dict( destination=None, prefix='', keep_vars=False ) ``` Returns a dictionary containing a whole state of the module. Both parameters and persistent buffers (e.g. running averages) are included. Keys are corresponding parameter and buffer names. #### Returns: * `dict`: a dictionary containing a whole state of the module Example:: ``` >>> module.state_dict().keys() ['bias', 'weight'] ```

to

``` python to( *args, **kwargs ) ``` Moves and/or casts the parameters and buffers. This can be called as .. function:: to(device=None, dtype=None, non_blocking=False) .. function:: to(dtype, non_blocking=False) .. function:: to(tensor, non_blocking=False) Its signature is similar to :meth:`torch.Tensor.to`, but only accepts floating point desired :attr:`dtype` s. In addition, this method will only cast the floating point parameters and buffers to :attr:`dtype` (if given). The integral parameters and buffers will be moved :attr:`device`, if that is given, but with dtypes unchanged. When :attr:`non_blocking` is set, it tries to convert/move asynchronously with respect to the host if possible, e.g., moving CPU Tensors with pinned memory to CUDA devices. See below for examples. .. note:: This method modifies the module in-place. #### Args: device (:class:`torch.device`): the desired device of the parameters and buffers in this module dtype (:class:`torch.dtype`): the desired floating point type of the floating point parameters and buffers in this module tensor (torch.Tensor): Tensor whose dtype and device are the desired dtype and device for all parameters and buffers in this module #### Returns: * `Module`: self Example:: ``` >>> linear = nn.Linear(2, 2) >>> linear.weight Parameter containing: tensor([[ 0.1913, -0.3420], [-0.5113, -0.2325]]) >>> linear.to(torch.double) Linear(in_features=2, out_features=2, bias=True) >>> linear.weight Parameter containing: tensor([[ 0.1913, -0.3420], [-0.5113, -0.2325]], dtype=torch.float64) >>> gpu1 = torch.device("cuda:1") >>> linear.to(gpu1, dtype=torch.half, non_blocking=True) Linear(in_features=2, out_features=2, bias=True) >>> linear.weight Parameter containing: tensor([[ 0.1914, -0.3420], [-0.5112, -0.2324]], dtype=torch.float16, device='cuda:1') >>> cpu = torch.device("cpu") >>> linear.to(cpu) Linear(in_features=2, out_features=2, bias=True) >>> linear.weight Parameter containing: tensor([[ 0.1914, -0.3420], [-0.5112, 
-0.2324]], dtype=torch.float16) ```

train

``` python train(mode=True) ``` Sets the module in training mode. This has any effect only on certain modules. See documentations of particular modules for details of their behaviors in training/evaluation mode, if they are affected, e.g. :class:`Dropout`, :class:`BatchNorm`, etc. #### Args: mode (bool): whether to set training mode (``True``) or evaluation mode (``False``). Default: ``True``. #### Returns: * `Module`: self

type

``` python type(dst_type) ``` Casts all parameters and buffers to :attr:`dst_type`. #### Arguments: dst_type (type or string): the desired type #### Returns: * `Module`: self

zero_grad

``` python zero_grad() ``` Sets gradients of all model parameters to zero. ================================================ FILE: docs/pytorch/haste_pytorch.md ================================================
# Module: haste_pytorch Haste: a fast, simple, and open RNN library. ## Classes [`class GRU`](./haste_pytorch/GRU.md): Gated Recurrent Unit layer. [`class IndRNN`](./haste_pytorch/IndRNN.md): Independently Recurrent Neural Network layer. [`class LSTM`](./haste_pytorch/LSTM.md): Long Short-Term Memory layer. [`class LayerNormGRU`](./haste_pytorch/LayerNormGRU.md): Layer Normalized Gated Recurrent Unit layer. [`class LayerNormLSTM`](./haste_pytorch/LayerNormLSTM.md): Layer Normalized Long Short-Term Memory layer. ================================================ FILE: docs/tf/haste_tf/GRU.md ================================================
# haste_tf.GRU ## Class `GRU` Gated Recurrent Unit layer. This GRU layer offers a fused, GPU-accelerated TensorFlow op for inference and training. There are two commonly-used variants of GRU cells. This one implements 1406.1078v1 which applies the reset gate to the hidden state after matrix multiplication. cuDNN also implements this variant. The other variant, 1406.1078v3, applies the reset gate before matrix multiplication and is currently unsupported. This layer has built-in support for DropConnect and Zoneout, which are both techniques used to regularize RNNs.

__init__

``` python __init__( num_units, direction='unidirectional', **kwargs ) ``` Initialize the parameters of the GRU layer. #### Arguments: * `num_units`: int, the number of units in the GRU cell. * `direction`: string, 'unidirectional' or 'bidirectional'. * `**kwargs`: Dict, keyword arguments (see below). #### Keyword Arguments: * `kernel_initializer`: (optional) the initializer to use for the input matrix weights. Defaults to `glorot_uniform`. * `recurrent_initializer`: (optional) the initializer to use for the recurrent matrix weights. Defaults to `orthogonal`. * `bias_initializer`: (optional) the initializer to use for input bias vectors. Defaults to `zeros`. * `recurrent_bias_initializer`: (optional) the initializer to use for recurrent bias vectors. Defaults to `zeros`. * `kernel_transform`: (optional) a function with signature `(kernel: Tensor) -> Tensor` that transforms the kernel before it is used. Defaults to the identity function. * `recurrent_transform`: (optional) a function with signature `(recurrent_kernel: Tensor) -> Tensor` that transforms the recurrent kernel before it is used. Defaults to the identity function. * `bias_transform`: (optional) a function with signature `(bias: Tensor) -> Tensor` that transforms the bias before it is used. Defaults to the identity function. * `recurrent_bias_transform`: (optional) a function with signature `(recurrent_bias: Tensor) -> Tensor` that transforms the recurrent bias before it is used. Defaults to the identity function. * `dropout`: (optional) float, sets the dropout rate for DropConnect regularization on the recurrent matrix. Defaults to 0. * `zoneout`: (optional) float, sets the zoneout rate for Zoneout regularization. Defaults to 0. * `dtype`: (optional) the data type for this layer. Defaults to `tf.float32`. * `name`: (optional) string, the name for this layer. ## Properties

bidirectional

`True` if this is a bidirectional RNN, `False` otherwise.

name

Returns the name of this module as passed or determined in the ctor. NOTE: This is not the same as the `self.name_scope.name` which includes parent module names.

name_scope

Returns a `tf.name_scope` instance for this class.

output_size

state_size

submodules

Sequence of all sub-modules. Submodules are modules which are properties of this module, or found as properties of modules which are properties of this module (and so on). ``` a = tf.Module() b = tf.Module() c = tf.Module() a.b = b b.c = c assert list(a.submodules) == [b, c] assert list(b.submodules) == [c] assert list(c.submodules) == [] ``` #### Returns: A sequence of all submodules.

trainable_variables

Sequence of variables owned by this module and its submodules. Note: this method uses reflection to find variables on the current instance and submodules. For performance reasons you may wish to cache the result of calling this method if you don't expect the return value to change. #### Returns: A sequence of variables for the current module (sorted by attribute name) followed by variables from all submodules recursively (breadth first).

variables

Sequence of variables owned by this module and its submodules. Note: this method uses reflection to find variables on the current instance and submodules. For performance reasons you may wish to cache the result of calling this method if you don't expect the return value to change. #### Returns: A sequence of variables for the current module (sorted by attribute name) followed by variables from all submodules recursively (breadth first). ## Methods

__call__

``` python __call__( inputs, training, sequence_length=None, time_major=False ) ``` Runs the RNN layer. #### Arguments: * `inputs`: Tensor, a rank 3 input tensor with shape [N,T,C] if `time_major` is `False`, or with shape [T,N,C] if `time_major` is `True`. * `training`: bool, `True` if running in training mode, `False` if running in inference mode. * `sequence_length`: (optional) Tensor, a rank 1 tensor with shape [N] and dtype of `tf.int32` or `tf.int64`. This tensor specifies the unpadded length of each example in the input minibatch. * `time_major`: (optional) bool, specifies whether `input` has shape [N,T,C] (`time_major=False`) or shape [T,N,C] (`time_major=True`). #### Returns: A pair, `(output, state)` for unidirectional layers, or a pair `([output_fw, output_bw], [state_fw, state_bw])` for bidirectional layers.

build

``` python build(shape) ``` Creates the variables of the layer. Calling this method is optional for users of the RNN class. It is called internally with the correct shape when `__call__` is invoked. #### Arguments: * `shape`: instance of `TensorShape`.

with_name_scope

``` python @classmethod with_name_scope( cls, method ) ``` Decorator to automatically enter the module name scope. ``` class MyModule(tf.Module): @tf.Module.with_name_scope def __call__(self, x): if not hasattr(self, 'w'): self.w = tf.Variable(tf.random.normal([x.shape[1], 64])) return tf.matmul(x, self.w) ``` Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose names included the module name: ``` mod = MyModule() mod(tf.ones([8, 32])) # ==> mod.w # ==> ``` #### Args: * `method`: The method to wrap. #### Returns: The original method wrapped such that it enters the module's name scope. ================================================ FILE: docs/tf/haste_tf/GRUCell.md ================================================
# haste_tf.GRUCell ## Class `GRUCell` A GRU cell that's compatible with the Haste GRU layer. This cell can be used on hardware other than GPUs and with other TensorFlow classes that operate on RNN cells (e.g. `dynamic_rnn`, `BasicDecoder`, cell wrappers, etc.).

__init__

``` python __init__( num_units, name=None, **kwargs ) ``` ## Properties

activity_regularizer

Optional regularizer function for the output of this layer.

dtype

dynamic

graph

DEPRECATED FUNCTION Warning: THIS FUNCTION IS DEPRECATED. It will be removed in a future version. Instructions for updating: Stop using this property because tf.layers layers no longer track their graph.

input

Retrieves the input tensor(s) of a layer. Only applicable if the layer has exactly one input, i.e. if it is connected to one incoming layer. #### Returns: Input tensor or list of input tensors. #### Raises: * `AttributeError`: if the layer is connected to more than one incoming layers. #### Raises: * `RuntimeError`: If called in Eager mode. * `AttributeError`: If no inbound nodes are found.

input_mask

Retrieves the input mask tensor(s) of a layer. Only applicable if the layer has exactly one inbound node, i.e. if it is connected to one incoming layer. #### Returns: Input mask tensor (potentially None) or list of input mask tensors. #### Raises: * `AttributeError`: if the layer is connected to more than one incoming layers.

input_shape

Retrieves the input shape(s) of a layer. Only applicable if the layer has exactly one input, i.e. if it is connected to one incoming layer, or if all inputs have the same shape. #### Returns: Input shape, as an integer shape tuple (or list of shape tuples, one tuple per input tensor). #### Raises: * `AttributeError`: if the layer has no defined input_shape. * `RuntimeError`: if called in Eager mode.

losses

Losses which are associated with this `Layer`. Variable regularization tensors are created when this property is accessed, so it is eager safe: accessing `losses` under a `tf.GradientTape` will propagate gradients back to the corresponding variables. #### Returns: A list of tensors.

metrics

name

Returns the name of this module as passed or determined in the ctor. NOTE: This is not the same as the `self.name_scope.name` which includes parent module names.

name_scope

Returns a `tf.name_scope` instance for this class.

non_trainable_variables

non_trainable_weights

output

Retrieves the output tensor(s) of a layer. Only applicable if the layer has exactly one output, i.e. if it is connected to one incoming layer. #### Returns: Output tensor or list of output tensors. #### Raises: * `AttributeError`: if the layer is connected to more than one incoming layers. * `RuntimeError`: if called in Eager mode.

output_mask

Retrieves the output mask tensor(s) of a layer. Only applicable if the layer has exactly one inbound node, i.e. if it is connected to one incoming layer. #### Returns: Output mask tensor (potentially None) or list of output mask tensors. #### Raises: * `AttributeError`: if the layer is connected to more than one incoming layers.

output_shape

Retrieves the output shape(s) of a layer. Only applicable if the layer has one output, or if all outputs have the same shape. #### Returns: Output shape, as an integer shape tuple (or list of shape tuples, one tuple per output tensor). #### Raises: * `AttributeError`: if the layer has no defined output shape. * `RuntimeError`: if called in Eager mode.

output_size

Integer or TensorShape: size of outputs produced by this cell.

scope_name

state_size

size(s) of state(s) used by this cell. It can be represented by an Integer, a TensorShape or a tuple of Integers or TensorShapes.

submodules

Sequence of all sub-modules. Submodules are modules which are properties of this module, or found as properties of modules which are properties of this module (and so on). ``` a = tf.Module() b = tf.Module() c = tf.Module() a.b = b b.c = c assert list(a.submodules) == [b, c] assert list(b.submodules) == [c] assert list(c.submodules) == [] ``` #### Returns: A sequence of all submodules.

trainable

trainable_variables

Sequence of variables owned by this module and its submodules. Note: this method uses reflection to find variables on the current instance and submodules. For performance reasons you may wish to cache the result of calling this method if you don't expect the return value to change. #### Returns: A sequence of variables for the current module (sorted by attribute name) followed by variables from all submodules recursively (breadth first).

trainable_weights

updates

variables

Returns the list of all layer variables/weights. Alias of `self.weights`. #### Returns: A list of variables.

weights

Returns the list of all layer variables/weights. #### Returns: A list of variables. ## Methods

__call__

``` python __call__( inputs, state, scope=None ) ``` Run this RNN cell on inputs, starting from the given state. #### Args: * `inputs`: `2-D` tensor with shape `[batch_size, input_size]`. * `state`: if `self.state_size` is an integer, this should be a `2-D Tensor` with shape `[batch_size, self.state_size]`. Otherwise, if `self.state_size` is a tuple of integers, this should be a tuple with shapes `[batch_size, s] for s in self.state_size`. * `scope`: VariableScope for the created subgraph; defaults to class name. #### Returns: * `A pair containing`: - Output: A `2-D` tensor with shape `[batch_size, self.output_size]`. - New state: Either a single `2-D` tensor, or a tuple of tensors matching the arity and shapes of `state`.

apply

``` python apply( inputs, *args, **kwargs ) ``` Apply the layer on a input. This is an alias of `self.__call__`. #### Arguments: * `inputs`: Input tensor(s). * `*args`: additional positional arguments to be passed to `self.call`. * `**kwargs`: additional keyword arguments to be passed to `self.call`. #### Returns: Output tensor(s).

build

``` python build(shape) ``` Creates the variables of the layer (optional, for subclass implementers). This is a method that implementers of subclasses of `Layer` or `Model` can override if they need a state-creation step in-between layer instantiation and layer call. This is typically used to create the weights of `Layer` subclasses. #### Arguments: * `input_shape`: Instance of `TensorShape`, or list of instances of `TensorShape` if the layer expects a list of inputs (one instance per input).

compute_mask

``` python compute_mask( inputs, mask=None ) ``` Computes an output mask tensor. #### Arguments: * `inputs`: Tensor or list of tensors. * `mask`: Tensor or list of tensors. #### Returns: None or a tensor (or list of tensors, one per output tensor of the layer).

compute_output_shape

``` python compute_output_shape(input_shape) ``` Computes the output shape of the layer. Assumes that the layer will be built to match that input shape provided. #### Arguments: * `input_shape`: Shape tuple (tuple of integers) or list of shape tuples (one per output tensor of the layer). Shape tuples can include None for free dimensions, instead of an integer. #### Returns: An input shape tuple.

count_params

``` python count_params() ``` Count the total number of scalars composing the weights. #### Returns: An integer count. #### Raises: * `ValueError`: if the layer isn't yet built (in which case its weights aren't yet defined).

from_config

``` python @classmethod from_config( cls, config ) ``` Creates a layer from its config. This method is the reverse of `get_config`, capable of instantiating the same layer from the config dictionary. It does not handle layer connectivity (handled by Network), nor weights (handled by `set_weights`). #### Arguments: * `config`: A Python dictionary, typically the output of get_config. #### Returns: A layer instance.

get_config

``` python get_config() ``` Returns the config of the layer. A layer config is a Python dictionary (serializable) containing the configuration of a layer. The same layer can be reinstantiated later (without its trained weights) from this configuration. The config of a layer does not include connectivity information, nor the layer class name. These are handled by `Network` (one layer of abstraction above). #### Returns: Python dictionary.

get_initial_state

``` python get_initial_state( inputs=None, batch_size=None, dtype=None ) ```

get_input_at

``` python get_input_at(node_index) ``` Retrieves the input tensor(s) of a layer at a given node. #### Arguments: * `node_index`: Integer, index of the node from which to retrieve the attribute. E.g. `node_index=0` will correspond to the first time the layer was called. #### Returns: A tensor (or list of tensors if the layer has multiple inputs). #### Raises: * `RuntimeError`: If called in Eager mode.

get_input_mask_at

``` python get_input_mask_at(node_index) ``` Retrieves the input mask tensor(s) of a layer at a given node. #### Arguments: * `node_index`: Integer, index of the node from which to retrieve the attribute. E.g. `node_index=0` will correspond to the first time the layer was called. #### Returns: A mask tensor (or list of tensors if the layer has multiple inputs).

get_input_shape_at

``` python get_input_shape_at(node_index) ``` Retrieves the input shape(s) of a layer at a given node. #### Arguments: * `node_index`: Integer, index of the node from which to retrieve the attribute. E.g. `node_index=0` will correspond to the first time the layer was called. #### Returns: A shape tuple (or list of shape tuples if the layer has multiple inputs). #### Raises: * `RuntimeError`: If called in Eager mode.

get_losses_for

``` python get_losses_for(inputs) ``` Retrieves losses relevant to a specific set of inputs. #### Arguments: * `inputs`: Input tensor or list/tuple of input tensors. #### Returns: List of loss tensors of the layer that depend on `inputs`.

get_output_at

``` python get_output_at(node_index) ``` Retrieves the output tensor(s) of a layer at a given node. #### Arguments: * `node_index`: Integer, index of the node from which to retrieve the attribute. E.g. `node_index=0` will correspond to the first time the layer was called. #### Returns: A tensor (or list of tensors if the layer has multiple outputs). #### Raises: * `RuntimeError`: If called in Eager mode.

get_output_mask_at

``` python get_output_mask_at(node_index) ``` Retrieves the output mask tensor(s) of a layer at a given node. #### Arguments: * `node_index`: Integer, index of the node from which to retrieve the attribute. E.g. `node_index=0` will correspond to the first time the layer was called. #### Returns: A mask tensor (or list of tensors if the layer has multiple outputs).

get_output_shape_at

``` python get_output_shape_at(node_index) ``` Retrieves the output shape(s) of a layer at a given node. #### Arguments: * `node_index`: Integer, index of the node from which to retrieve the attribute. E.g. `node_index=0` will correspond to the first time the layer was called. #### Returns: A shape tuple (or list of shape tuples if the layer has multiple outputs). #### Raises: * `RuntimeError`: If called in Eager mode.

get_updates_for

``` python get_updates_for(inputs) ``` Retrieves updates relevant to a specific set of inputs. #### Arguments: * `inputs`: Input tensor or list/tuple of input tensors. #### Returns: List of update ops of the layer that depend on `inputs`.

get_weights

``` python get_weights() ``` Returns the current weights of the layer. #### Returns: Weights values as a list of numpy arrays.

set_weights

``` python set_weights(weights) ``` Sets the weights of the layer, from Numpy arrays. #### Arguments: * `weights`: a list of Numpy arrays. The number of arrays and their shape must match number of the dimensions of the weights of the layer (i.e. it should match the output of `get_weights`). #### Raises: * `ValueError`: If the provided weights list does not match the layer's specifications.

with_name_scope

``` python @classmethod with_name_scope( cls, method ) ``` Decorator to automatically enter the module name scope. ``` class MyModule(tf.Module): @tf.Module.with_name_scope def __call__(self, x): if not hasattr(self, 'w'): self.w = tf.Variable(tf.random.normal([x.shape[1], 64])) return tf.matmul(x, self.w) ``` Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose names included the module name: ``` mod = MyModule() mod(tf.ones([8, 32])) # ==> mod.w # ==> ``` #### Args: * `method`: The method to wrap. #### Returns: The original method wrapped such that it enters the module's name scope.

zero_state

``` python zero_state( batch_size, dtype ) ``` Return zero-filled state tensor(s). #### Args: * `batch_size`: int, float, or unit Tensor representing the batch size. * `dtype`: the data type to use for the state. #### Returns: If `state_size` is an int or TensorShape, then the return value is a `N-D` tensor of shape `[batch_size, state_size]` filled with zeros. If `state_size` is a nested list or tuple, then the return value is a nested list or tuple (of the same structure) of `2-D` tensors with the shapes `[batch_size, s]` for each s in `state_size`. ================================================ FILE: docs/tf/haste_tf/IndRNN.md ================================================
# haste_tf.IndRNN ## Class `IndRNN` Independently Recurrent Neural Network layer. This layer offers a fused, GPU-accelerated TensorFlow op for inference and training. It also supports Zoneout regularization.

__init__

``` python __init__( num_units, direction='unidirectional', **kwargs ) ``` Initialize the parameters of the IndRNN layer. #### Arguments: * `num_units`: int, the number of units in the IndRNN cell. * `direction`: string, 'unidirectional' or 'bidirectional'. * `**kwargs`: Dict, keyword arguments (see below). #### Keyword Arguments: * `kernel_initializer`: (optional) the initializer to use for the input matrix weights. Defaults to `glorot_uniform`. * `recurrent_initializer`: (optional) the initializer to use for the recurrent scale weights. Defaults to uniform random in [-0.5, 0.5]. Note that this initialization scheme is different than in the original authors' implementation. See https://github.com/lmnt-com/haste/issues/7 for details. * `bias_initializer`: (optional) the initializer to use for the bias vector. Defaults to `zeros`. * `kernel_transform`: (optional) a function with signature `(kernel: Tensor) -> Tensor` that transforms the kernel before it is used. Defaults to the identity function. * `recurrent_transform`: (optional) a function with signature `(recurrent_scale: Tensor) -> Tensor` that transforms the recurrent scale vector before it is used. Defaults to the identity function. * `bias_transform`: (optional) a function with signature `(bias: Tensor) -> Tensor` that transforms the bias before it is used. Defaults to the identity function. * `zoneout`: (optional) float, sets the zoneout rate for Zoneout regularization. Defaults to 0. * `dtype`: (optional) the data type for this layer. Defaults to `tf.float32`. * `name`: (optional) string, the name for this layer. ## Properties

bidirectional

`True` if this is a bidirectional RNN, `False` otherwise.

name

Returns the name of this module as passed or determined in the ctor. NOTE: This is not the same as the `self.name_scope.name` which includes parent module names.

name_scope

Returns a `tf.name_scope` instance for this class.

output_size

state_size

submodules

Sequence of all sub-modules. Submodules are modules which are properties of this module, or found as properties of modules which are properties of this module (and so on). ``` a = tf.Module() b = tf.Module() c = tf.Module() a.b = b b.c = c assert list(a.submodules) == [b, c] assert list(b.submodules) == [c] assert list(c.submodules) == [] ``` #### Returns: A sequence of all submodules.

trainable_variables

Sequence of variables owned by this module and its submodules. Note: this method uses reflection to find variables on the current instance and submodules. For performance reasons you may wish to cache the result of calling this method if you don't expect the return value to change. #### Returns: A sequence of variables for the current module (sorted by attribute name) followed by variables from all submodules recursively (breadth first).

variables

Sequence of variables owned by this module and its submodules. Note: this method uses reflection to find variables on the current instance and submodules. For performance reasons you may wish to cache the result of calling this method if you don't expect the return value to change. #### Returns: A sequence of variables for the current module (sorted by attribute name) followed by variables from all submodules recursively (breadth first). ## Methods

__call__

``` python __call__( inputs, training, sequence_length=None, time_major=False ) ``` Runs the RNN layer. #### Arguments: * `inputs`: Tensor, a rank 3 input tensor with shape [N,T,C] if `time_major` is `False`, or with shape [T,N,C] if `time_major` is `True`. * `training`: bool, `True` if running in training mode, `False` if running in inference mode. * `sequence_length`: (optional) Tensor, a rank 1 tensor with shape [N] and dtype of `tf.int32` or `tf.int64`. This tensor specifies the unpadded length of each example in the input minibatch. * `time_major`: (optional) bool, specifies whether `input` has shape [N,T,C] (`time_major=False`) or shape [T,N,C] (`time_major=True`). #### Returns: A pair, `(output, state)` for unidirectional layers, or a pair `([output_fw, output_bw], [state_fw, state_bw])` for bidirectional layers.

build

``` python build(shape) ``` Creates the variables of the layer. Calling this method is optional for users of the RNN class. It is called internally with the correct shape when `__call__` is invoked. #### Arguments: * `shape`: instance of `TensorShape`.

with_name_scope

``` python @classmethod with_name_scope( cls, method ) ``` Decorator to automatically enter the module name scope. ``` class MyModule(tf.Module): @tf.Module.with_name_scope def __call__(self, x): if not hasattr(self, 'w'): self.w = tf.Variable(tf.random.normal([x.shape[1], 64])) return tf.matmul(x, self.w) ``` Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose names included the module name: ``` mod = MyModule() mod(tf.ones([8, 32])) # ==> mod.w # ==> ``` #### Args: * `method`: The method to wrap. #### Returns: The original method wrapped such that it enters the module's name scope. ================================================ FILE: docs/tf/haste_tf/LSTM.md ================================================
# haste_tf.LSTM ## Class `LSTM` Long Short-Term Memory layer. This LSTM layer offers a fused, GPU-accelerated TensorFlow op for inference and training. Its weights and variables are compatible with `BasicLSTMCell`, `LSTMCell`, and `LSTMBlockCell` by default, and is able to load weights from `tf.contrib.cudnn_rnn.CudnnLSTM` when `cudnn_compat=True` is specified. Although this implementation is comparable in performance to cuDNN's LSTM, it offers additional options not typically found in other high-performance implementations. DropConnect and Zoneout regularization are built-in, and this layer allows setting a non-zero initial forget gate bias.

__init__

``` python __init__( num_units, direction='unidirectional', **kwargs ) ``` Initialize the parameters of the LSTM layer. #### Arguments: * `num_units`: int, the number of units in the LSTM cell. * `direction`: string, 'unidirectional' or 'bidirectional'. * `**kwargs`: Dict, keyword arguments (see below). #### Keyword Arguments: * `kernel_initializer`: (optional) the initializer to use for the input matrix weights. Defaults to `glorot_uniform`. * `recurrent_initializer`: (optional) the initializer to use for the recurrent matrix weights. Defaults to `orthogonal`. * `bias_initializer`: (optional) the initializer to use for both input and recurrent bias vectors. Defaults to `zeros` unless `forget_bias` is non-zero (see below). * `kernel_transform`: (optional) a function with signature `(kernel: Tensor) -> Tensor` that transforms the kernel before it is used. Defaults to the identity function. * `recurrent_transform`: (optional) a function with signature `(recurrent_kernel: Tensor) -> Tensor` that transforms the recurrent kernel before it is used. Defaults to the identity function. * `bias_transform`: (optional) a function with signature `(bias: Tensor) -> Tensor` that transforms the bias before it is used. Defaults to the identity function. * `forget_bias`: (optional) float, sets the initial weights for the forget gates. Defaults to 1 and overrides the `bias_initializer` unless this argument is set to 0. * `dropout`: (optional) float, sets the dropout rate for DropConnect regularization on the recurrent matrix. Defaults to 0. * `zoneout`: (optional) float, sets the zoneout rate for Zoneout regularization. Defaults to 0. * `dtype`: (optional) the data type for this layer. Defaults to `tf.float32`. * `name`: (optional) string, the name for this layer. * `cudnn_compat`: (optional) bool, if `True`, the variables created by this layer are compatible with `tf.contrib.cudnn_rnn.CudnnLSTM`. Note that this should only be set if you're restoring variables from a cuDNN model. 
It's currently not possible to train a model with `cudnn_compat=True` and restore it with CudnnLSTM. Defaults to `False`. ## Properties

bidirectional

`True` if this is a bidirectional RNN, `False` otherwise.

name

Returns the name of this module as passed or determined in the ctor. NOTE: This is not the same as the `self.name_scope.name` which includes parent module names.

name_scope

Returns a `tf.name_scope` instance for this class.

output_size

state_size

submodules

Sequence of all sub-modules. Submodules are modules which are properties of this module, or found as properties of modules which are properties of this module (and so on). ``` a = tf.Module() b = tf.Module() c = tf.Module() a.b = b b.c = c assert list(a.submodules) == [b, c] assert list(b.submodules) == [c] assert list(c.submodules) == [] ``` #### Returns: A sequence of all submodules.

trainable_variables

Sequence of variables owned by this module and its submodules. Note: this method uses reflection to find variables on the current instance and submodules. For performance reasons you may wish to cache the result of calling this method if you don't expect the return value to change. #### Returns: A sequence of variables for the current module (sorted by attribute name) followed by variables from all submodules recursively (breadth first).

variables

Sequence of variables owned by this module and its submodules. Note: this method uses reflection to find variables on the current instance and submodules. For performance reasons you may wish to cache the result of calling this method if you don't expect the return value to change. #### Returns: A sequence of variables for the current module (sorted by attribute name) followed by variables from all submodules recursively (breadth first). ## Methods

__call__

``` python __call__( inputs, training, sequence_length=None, time_major=False ) ``` Runs the RNN layer. #### Arguments: * `inputs`: Tensor, a rank 3 input tensor with shape [N,T,C] if `time_major` is `False`, or with shape [T,N,C] if `time_major` is `True`. * `training`: bool, `True` if running in training mode, `False` if running in inference mode. * `sequence_length`: (optional) Tensor, a rank 1 tensor with shape [N] and dtype of `tf.int32` or `tf.int64`. This tensor specifies the unpadded length of each example in the input minibatch. * `time_major`: (optional) bool, specifies whether `input` has shape [N,T,C] (`time_major=False`) or shape [T,N,C] (`time_major=True`). #### Returns: A pair, `(output, state)` for unidirectional layers, or a pair `([output_fw, output_bw], [state_fw, state_bw])` for bidirectional layers.

build

``` python build(shape) ``` Creates the variables of the layer. Calling this method is optional for users of the RNN class. It is called internally with the correct shape when `__call__` is invoked. #### Arguments: * `shape`: instance of `TensorShape`.

with_name_scope

``` python @classmethod with_name_scope( cls, method ) ``` Decorator to automatically enter the module name scope. ``` class MyModule(tf.Module): @tf.Module.with_name_scope def __call__(self, x): if not hasattr(self, 'w'): self.w = tf.Variable(tf.random.normal([x.shape[1], 64])) return tf.matmul(x, self.w) ``` Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose names included the module name: ``` mod = MyModule() mod(tf.ones([8, 32])) # ==> mod.w # ==> ``` #### Args: * `method`: The method to wrap. #### Returns: The original method wrapped such that it enters the module's name scope. ================================================ FILE: docs/tf/haste_tf/LayerNorm.md ================================================
# haste_tf.LayerNorm ## Class `LayerNorm` Layer normalization layer. This class exposes a fused and GPU-accelerated implementation of layer normalization as described by [Ba et al.](https://arxiv.org/abs/1607.06450)

__init__

``` python __init__(name=None) ``` Initialize the parameters of the layer normalization layer. #### Arguments: * `name`: (optional) string, the name for this layer. ## Properties

name

Returns the name of this module as passed or determined in the ctor. NOTE: This is not the same as the `self.name_scope.name` which includes parent module names.

name_scope

Returns a `tf.name_scope` instance for this class.

submodules

Sequence of all sub-modules. Submodules are modules which are properties of this module, or found as properties of modules which are properties of this module (and so on). ``` a = tf.Module() b = tf.Module() c = tf.Module() a.b = b b.c = c assert list(a.submodules) == [b, c] assert list(b.submodules) == [c] assert list(c.submodules) == [] ``` #### Returns: A sequence of all submodules.

trainable_variables

Sequence of variables owned by this module and its submodules. Note: this method uses reflection to find variables on the current instance and submodules. For performance reasons you may wish to cache the result of calling this method if you don't expect the return value to change. #### Returns: A sequence of variables for the current module (sorted by attribute name) followed by variables from all submodules recursively (breadth first).

variables

Sequence of variables owned by this module and its submodules. Note: this method uses reflection to find variables on the current instance and submodules. For performance reasons you may wish to cache the result of calling this method if you don't expect the return value to change. #### Returns: A sequence of variables for the current module (sorted by attribute name) followed by variables from all submodules recursively (breadth first). ## Methods

__call__

``` python __call__(x) ``` Runs the layer. #### Arguments: * `x`: Tensor, a rank R tensor. #### Returns: * `y`: Tensor, a rank R tensor with the last dimension normalized.

build

``` python build(shape) ``` Creates the variables of the layer. Calling this method is optional for users of the LayerNorm class. It is called internally with the correct shape when `__call__` is invoked. #### Arguments: * `shape`: instance of `TensorShape`.

with_name_scope

``` python @classmethod with_name_scope( cls, method ) ``` Decorator to automatically enter the module name scope. ``` class MyModule(tf.Module): @tf.Module.with_name_scope def __call__(self, x): if not hasattr(self, 'w'): self.w = tf.Variable(tf.random.normal([x.shape[1], 64])) return tf.matmul(x, self.w) ``` Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose names included the module name: ``` mod = MyModule() mod(tf.ones([8, 32])) # ==> mod.w # ==> ``` #### Args: * `method`: The method to wrap. #### Returns: The original method wrapped such that it enters the module's name scope. ================================================ FILE: docs/tf/haste_tf/LayerNormGRU.md ================================================
# haste_tf.LayerNormGRU ## Class `LayerNormGRU` Layer Normalized Gated Recurrent Unit layer. This GRU layer applies layer normalization to the input and recurrent output activations of a standard GRU. The implementation is fused and GPU-accelerated. There are two commonly-used variants of GRU cells. This one implements 1406.1078v1 which applies the reset gate to the hidden state after matrix multiplication. The other variant, 1406.1078v3, applies the reset gate before matrix multiplication and is currently unsupported. This layer has built-in support for DropConnect and Zoneout, which are both techniques used to regularize RNNs.

__init__

``` python __init__( num_units, direction='unidirectional', **kwargs ) ``` Initialize the parameters of the GRU layer. #### Arguments: * `num_units`: int, the number of units in the GRU cell. * `direction`: string, 'unidirectional' or 'bidirectional'. * `**kwargs`: Dict, keyword arguments (see below). #### Keyword Arguments: * `kernel_initializer`: (optional) the initializer to use for the input matrix weights. Defaults to `glorot_uniform`. * `recurrent_initializer`: (optional) the initializer to use for the recurrent matrix weights. Defaults to `orthogonal`. * `bias_initializer`: (optional) the initializer to use for input bias vectors. Defaults to `zeros`. * `recurrent_bias_initializer`: (optional) the initializer to use for recurrent bias vectors. Defaults to `zeros`. * `kernel_transform`: (optional) a function with signature `(kernel: Tensor) -> Tensor` that transforms the kernel before it is used. Defaults to the identity function. * `recurrent_transform`: (optional) a function with signature `(recurrent_kernel: Tensor) -> Tensor` that transforms the recurrent kernel before it is used. Defaults to the identity function. * `bias_transform`: (optional) a function with signature `(bias: Tensor) -> Tensor` that transforms the bias before it is used. Defaults to the identity function. * `recurrent_bias_transform`: (optional) a function with signature `(recurrent_bias: Tensor) -> Tensor` that transforms the recurrent bias before it is used. Defaults to the identity function. * `dropout`: (optional) float, sets the dropout rate for DropConnect regularization on the recurrent matrix. Defaults to 0. * `zoneout`: (optional) float, sets the zoneout rate for Zoneout regularization. Defaults to 0. * `dtype`: (optional) the data type for this layer. Defaults to `tf.float32`. * `name`: (optional) string, the name for this layer. ## Properties

bidirectional

`True` if this is a bidirectional RNN, `False` otherwise.

name

Returns the name of this module as passed or determined in the ctor. NOTE: This is not the same as the `self.name_scope.name` which includes parent module names.

name_scope

Returns a `tf.name_scope` instance for this class.

output_size

state_size

submodules

Sequence of all sub-modules. Submodules are modules which are properties of this module, or found as properties of modules which are properties of this module (and so on). ``` a = tf.Module() b = tf.Module() c = tf.Module() a.b = b b.c = c assert list(a.submodules) == [b, c] assert list(b.submodules) == [c] assert list(c.submodules) == [] ``` #### Returns: A sequence of all submodules.

trainable_variables

Sequence of variables owned by this module and its submodules. Note: this method uses reflection to find variables on the current instance and submodules. For performance reasons you may wish to cache the result of calling this method if you don't expect the return value to change. #### Returns: A sequence of variables for the current module (sorted by attribute name) followed by variables from all submodules recursively (breadth first).

variables

Sequence of variables owned by this module and its submodules. Note: this method uses reflection to find variables on the current instance and submodules. For performance reasons you may wish to cache the result of calling this method if you don't expect the return value to change. #### Returns: A sequence of variables for the current module (sorted by attribute name) followed by variables from all submodules recursively (breadth first). ## Methods

__call__

``` python __call__( inputs, training, sequence_length=None, time_major=False ) ``` Runs the RNN layer. #### Arguments: * `inputs`: Tensor, a rank 3 input tensor with shape [N,T,C] if `time_major` is `False`, or with shape [T,N,C] if `time_major` is `True`. * `training`: bool, `True` if running in training mode, `False` if running in inference mode. * `sequence_length`: (optional) Tensor, a rank 1 tensor with shape [N] and dtype of `tf.int32` or `tf.int64`. This tensor specifies the unpadded length of each example in the input minibatch. * `time_major`: (optional) bool, specifies whether `input` has shape [N,T,C] (`time_major=False`) or shape [T,N,C] (`time_major=True`). #### Returns: A pair, `(output, state)` for unidirectional layers, or a pair `([output_fw, output_bw], [state_fw, state_bw])` for bidirectional layers.

build

``` python build(shape) ``` Creates the variables of the layer. Calling this method is optional for users of the RNN class. It is called internally with the correct shape when `__call__` is invoked. #### Arguments: * `shape`: instance of `TensorShape`.

with_name_scope

``` python @classmethod with_name_scope( cls, method ) ``` Decorator to automatically enter the module name scope. ``` class MyModule(tf.Module): @tf.Module.with_name_scope def __call__(self, x): if not hasattr(self, 'w'): self.w = tf.Variable(tf.random.normal([x.shape[1], 64])) return tf.matmul(x, self.w) ``` Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose names included the module name: ``` mod = MyModule() mod(tf.ones([8, 32])) # ==> mod.w # ==> ``` #### Args: * `method`: The method to wrap. #### Returns: The original method wrapped such that it enters the module's name scope. ================================================ FILE: docs/tf/haste_tf/LayerNormGRUCell.md ================================================
# haste_tf.LayerNormGRUCell ## Class `LayerNormGRUCell` A GRU cell that's compatible with the Haste LayerNormGRU layer. This cell can be used on hardware other than GPUs and with other TensorFlow classes that operate on RNN cells (e.g. `dynamic_rnn`, `BasicDecoder`, cell wrappers, etc.).

__init__

``` python __init__( num_units, forget_bias=1.0, dropout=0.0, dtype=None, name=None, **kwargs ) ``` ## Properties

activity_regularizer

Optional regularizer function for the output of this layer.

dtype

dynamic

graph

DEPRECATED FUNCTION Warning: THIS FUNCTION IS DEPRECATED. It will be removed in a future version. Instructions for updating: Stop using this property because tf.layers layers no longer track their graph.

input

Retrieves the input tensor(s) of a layer. Only applicable if the layer has exactly one input, i.e. if it is connected to one incoming layer. #### Returns: Input tensor or list of input tensors. #### Raises: * `AttributeError`: if the layer is connected to more than one incoming layer. * `RuntimeError`: If called in Eager mode. * `AttributeError`: If no inbound nodes are found.

input_mask

Retrieves the input mask tensor(s) of a layer. Only applicable if the layer has exactly one inbound node, i.e. if it is connected to one incoming layer. #### Returns: Input mask tensor (potentially None) or list of input mask tensors. #### Raises: * `AttributeError`: if the layer is connected to more than one incoming layers.

input_shape

Retrieves the input shape(s) of a layer. Only applicable if the layer has exactly one input, i.e. if it is connected to one incoming layer, or if all inputs have the same shape. #### Returns: Input shape, as an integer shape tuple (or list of shape tuples, one tuple per input tensor). #### Raises: * `AttributeError`: if the layer has no defined input_shape. * `RuntimeError`: if called in Eager mode.

losses

Losses which are associated with this `Layer`. Variable regularization tensors are created when this property is accessed, so it is eager safe: accessing `losses` under a `tf.GradientTape` will propagate gradients back to the corresponding variables. #### Returns: A list of tensors.

metrics

name

Returns the name of this module as passed or determined in the ctor. NOTE: This is not the same as the `self.name_scope.name` which includes parent module names.

name_scope

Returns a `tf.name_scope` instance for this class.

non_trainable_variables

non_trainable_weights

output

Retrieves the output tensor(s) of a layer. Only applicable if the layer has exactly one output, i.e. if it is connected to one incoming layer. #### Returns: Output tensor or list of output tensors. #### Raises: * `AttributeError`: if the layer is connected to more than one incoming layers. * `RuntimeError`: if called in Eager mode.

output_mask

Retrieves the output mask tensor(s) of a layer. Only applicable if the layer has exactly one inbound node, i.e. if it is connected to one incoming layer. #### Returns: Output mask tensor (potentially None) or list of output mask tensors. #### Raises: * `AttributeError`: if the layer is connected to more than one incoming layers.

output_shape

Retrieves the output shape(s) of a layer. Only applicable if the layer has one output, or if all outputs have the same shape. #### Returns: Output shape, as an integer shape tuple (or list of shape tuples, one tuple per output tensor). #### Raises: * `AttributeError`: if the layer has no defined output shape. * `RuntimeError`: if called in Eager mode.

output_size

Integer or TensorShape: size of outputs produced by this cell.

scope_name

state_size

size(s) of state(s) used by this cell. It can be represented by an Integer, a TensorShape or a tuple of Integers or TensorShapes.

submodules

Sequence of all sub-modules. Submodules are modules which are properties of this module, or found as properties of modules which are properties of this module (and so on). ``` a = tf.Module() b = tf.Module() c = tf.Module() a.b = b b.c = c assert list(a.submodules) == [b, c] assert list(b.submodules) == [c] assert list(c.submodules) == [] ``` #### Returns: A sequence of all submodules.

trainable

trainable_variables

Sequence of variables owned by this module and its submodules. Note: this method uses reflection to find variables on the current instance and submodules. For performance reasons you may wish to cache the result of calling this method if you don't expect the return value to change. #### Returns: A sequence of variables for the current module (sorted by attribute name) followed by variables from all submodules recursively (breadth first).

trainable_weights

updates

variables

Returns the list of all layer variables/weights. Alias of `self.weights`. #### Returns: A list of variables.

weights

Returns the list of all layer variables/weights. #### Returns: A list of variables. ## Methods

__call__

``` python __call__( inputs, state, scope=None ) ``` Run this RNN cell on inputs, starting from the given state. #### Args: * `inputs`: `2-D` tensor with shape `[batch_size, input_size]`. * `state`: if `self.state_size` is an integer, this should be a `2-D Tensor` with shape `[batch_size, self.state_size]`. Otherwise, if `self.state_size` is a tuple of integers, this should be a tuple with shapes `[batch_size, s] for s in self.state_size`. * `scope`: VariableScope for the created subgraph; defaults to class name. #### Returns: * `A pair containing`: - Output: A `2-D` tensor with shape `[batch_size, self.output_size]`. - New state: Either a single `2-D` tensor, or a tuple of tensors matching the arity and shapes of `state`.

apply

``` python apply( inputs, *args, **kwargs ) ``` Apply the layer on a input. This is an alias of `self.__call__`. #### Arguments: * `inputs`: Input tensor(s). * `*args`: additional positional arguments to be passed to `self.call`. * `**kwargs`: additional keyword arguments to be passed to `self.call`. #### Returns: Output tensor(s).

build

``` python build(shape) ``` Creates the variables of the layer (optional, for subclass implementers). This is a method that implementers of subclasses of `Layer` or `Model` can override if they need a state-creation step in-between layer instantiation and layer call. This is typically used to create the weights of `Layer` subclasses. #### Arguments: * `input_shape`: Instance of `TensorShape`, or list of instances of `TensorShape` if the layer expects a list of inputs (one instance per input).

compute_mask

``` python compute_mask( inputs, mask=None ) ``` Computes an output mask tensor. #### Arguments: * `inputs`: Tensor or list of tensors. * `mask`: Tensor or list of tensors. #### Returns: None or a tensor (or list of tensors, one per output tensor of the layer).

compute_output_shape

``` python compute_output_shape(input_shape) ``` Computes the output shape of the layer. Assumes that the layer will be built to match that input shape provided. #### Arguments: * `input_shape`: Shape tuple (tuple of integers) or list of shape tuples (one per output tensor of the layer). Shape tuples can include None for free dimensions, instead of an integer. #### Returns: An input shape tuple.

count_params

``` python count_params() ``` Count the total number of scalars composing the weights. #### Returns: An integer count. #### Raises: * `ValueError`: if the layer isn't yet built (in which case its weights aren't yet defined).

from_config

``` python @classmethod from_config( cls, config ) ``` Creates a layer from its config. This method is the reverse of `get_config`, capable of instantiating the same layer from the config dictionary. It does not handle layer connectivity (handled by Network), nor weights (handled by `set_weights`). #### Arguments: * `config`: A Python dictionary, typically the output of get_config. #### Returns: A layer instance.

get_config

``` python get_config() ``` Returns the config of the layer. A layer config is a Python dictionary (serializable) containing the configuration of a layer. The same layer can be reinstantiated later (without its trained weights) from this configuration. The config of a layer does not include connectivity information, nor the layer class name. These are handled by `Network` (one layer of abstraction above). #### Returns: Python dictionary.

get_initial_state

``` python get_initial_state( inputs=None, batch_size=None, dtype=None ) ```

get_input_at

``` python get_input_at(node_index) ``` Retrieves the input tensor(s) of a layer at a given node. #### Arguments: * `node_index`: Integer, index of the node from which to retrieve the attribute. E.g. `node_index=0` will correspond to the first time the layer was called. #### Returns: A tensor (or list of tensors if the layer has multiple inputs). #### Raises: * `RuntimeError`: If called in Eager mode.

get_input_mask_at

``` python get_input_mask_at(node_index) ``` Retrieves the input mask tensor(s) of a layer at a given node. #### Arguments: * `node_index`: Integer, index of the node from which to retrieve the attribute. E.g. `node_index=0` will correspond to the first time the layer was called. #### Returns: A mask tensor (or list of tensors if the layer has multiple inputs).

get_input_shape_at

``` python get_input_shape_at(node_index) ``` Retrieves the input shape(s) of a layer at a given node. #### Arguments: * `node_index`: Integer, index of the node from which to retrieve the attribute. E.g. `node_index=0` will correspond to the first time the layer was called. #### Returns: A shape tuple (or list of shape tuples if the layer has multiple inputs). #### Raises: * `RuntimeError`: If called in Eager mode.

get_losses_for

``` python get_losses_for(inputs) ``` Retrieves losses relevant to a specific set of inputs. #### Arguments: * `inputs`: Input tensor or list/tuple of input tensors. #### Returns: List of loss tensors of the layer that depend on `inputs`.

get_output_at

``` python get_output_at(node_index) ``` Retrieves the output tensor(s) of a layer at a given node. #### Arguments: * `node_index`: Integer, index of the node from which to retrieve the attribute. E.g. `node_index=0` will correspond to the first time the layer was called. #### Returns: A tensor (or list of tensors if the layer has multiple outputs). #### Raises: * `RuntimeError`: If called in Eager mode.

get_output_mask_at

``` python get_output_mask_at(node_index) ``` Retrieves the output mask tensor(s) of a layer at a given node. #### Arguments: * `node_index`: Integer, index of the node from which to retrieve the attribute. E.g. `node_index=0` will correspond to the first time the layer was called. #### Returns: A mask tensor (or list of tensors if the layer has multiple outputs).

get_output_shape_at

``` python get_output_shape_at(node_index) ``` Retrieves the output shape(s) of a layer at a given node. #### Arguments: * `node_index`: Integer, index of the node from which to retrieve the attribute. E.g. `node_index=0` will correspond to the first time the layer was called. #### Returns: A shape tuple (or list of shape tuples if the layer has multiple outputs). #### Raises: * `RuntimeError`: If called in Eager mode.

get_updates_for

``` python get_updates_for(inputs) ``` Retrieves updates relevant to a specific set of inputs. #### Arguments: * `inputs`: Input tensor or list/tuple of input tensors. #### Returns: List of update ops of the layer that depend on `inputs`.

get_weights

``` python get_weights() ``` Returns the current weights of the layer. #### Returns: Weights values as a list of numpy arrays.

set_weights

``` python set_weights(weights) ``` Sets the weights of the layer, from Numpy arrays. #### Arguments: * `weights`: a list of Numpy arrays. The number of arrays and their shape must match number of the dimensions of the weights of the layer (i.e. it should match the output of `get_weights`). #### Raises: * `ValueError`: If the provided weights list does not match the layer's specifications.

with_name_scope

``` python @classmethod with_name_scope( cls, method ) ``` Decorator to automatically enter the module name scope. ``` class MyModule(tf.Module): @tf.Module.with_name_scope def __call__(self, x): if not hasattr(self, 'w'): self.w = tf.Variable(tf.random.normal([x.shape[1], 64])) return tf.matmul(x, self.w) ``` Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose names included the module name: ``` mod = MyModule() mod(tf.ones([8, 32])) # ==> mod.w # ==> ``` #### Args: * `method`: The method to wrap. #### Returns: The original method wrapped such that it enters the module's name scope.

zero_state

``` python zero_state( batch_size, dtype ) ``` Return zero-filled state tensor(s). #### Args: * `batch_size`: int, float, or unit Tensor representing the batch size. * `dtype`: the data type to use for the state. #### Returns: If `state_size` is an int or TensorShape, then the return value is a `N-D` tensor of shape `[batch_size, state_size]` filled with zeros. If `state_size` is a nested list or tuple, then the return value is a nested list or tuple (of the same structure) of `2-D` tensors with the shapes `[batch_size, s]` for each s in `state_size`. ================================================ FILE: docs/tf/haste_tf/LayerNormLSTM.md ================================================
# haste_tf.LayerNormLSTM ## Class `LayerNormLSTM` Layer Normalized Long Short-Term Memory layer. This LSTM layer applies layer normalization to the input, recurrent, and output activations of a standard LSTM. The implementation is fused and GPU-accelerated. DropConnect and Zoneout regularization are built-in, and this layer allows setting a non-zero initial forget gate bias. Details about the exact function this layer implements can be found at https://github.com/lmnt-com/haste/issues/1.

__init__

``` python __init__( num_units, direction='unidirectional', **kwargs ) ``` Initialize the parameters of the LSTM layer. #### Arguments: * `num_units`: int, the number of units in the LSTM cell. * `direction`: string, 'unidirectional' or 'bidirectional'. * `**kwargs`: Dict, keyword arguments (see below). #### Keyword Arguments: * `kernel_initializer`: (optional) the initializer to use for the input matrix weights. Defaults to `glorot_uniform`. * `recurrent_initializer`: (optional) the initializer to use for the recurrent matrix weights. Defaults to `orthogonal`. * `bias_initializer`: (optional) the initializer to use for both input and recurrent bias vectors. Defaults to `zeros` unless `forget_bias` is non-zero (see below). * `kernel_transform`: (optional) a function with signature `(kernel: Tensor) -> Tensor` that transforms the kernel before it is used. Defaults to the identity function. * `recurrent_transform`: (optional) a function with signature `(recurrent_kernel: Tensor) -> Tensor` that transforms the recurrent kernel before it is used. Defaults to the identity function. * `bias_transform`: (optional) a function with signature `(bias: Tensor) -> Tensor` that transforms the bias before it is used. Defaults to the identity function. * `forget_bias`: (optional) float, sets the initial weights for the forget gates. Defaults to 1 and overrides the `bias_initializer` unless this argument is set to 0. * `dropout`: (optional) float, sets the dropout rate for DropConnect regularization on the recurrent matrix. Defaults to 0. * `zoneout`: (optional) float, sets the zoneout rate for Zoneout regularization. Defaults to 0. * `dtype`: (optional) the data type for this layer. Defaults to `tf.float32`. * `name`: (optional) string, the name for this layer. ## Properties

bidirectional

`True` if this is a bidirectional RNN, `False` otherwise.

name

Returns the name of this module as passed or determined in the ctor. NOTE: This is not the same as the `self.name_scope.name` which includes parent module names.

name_scope

Returns a `tf.name_scope` instance for this class.

output_size

state_size

submodules

Sequence of all sub-modules. Submodules are modules which are properties of this module, or found as properties of modules which are properties of this module (and so on). ``` a = tf.Module() b = tf.Module() c = tf.Module() a.b = b b.c = c assert list(a.submodules) == [b, c] assert list(b.submodules) == [c] assert list(c.submodules) == [] ``` #### Returns: A sequence of all submodules.

trainable_variables

Sequence of variables owned by this module and its submodules. Note: this method uses reflection to find variables on the current instance and submodules. For performance reasons you may wish to cache the result of calling this method if you don't expect the return value to change. #### Returns: A sequence of variables for the current module (sorted by attribute name) followed by variables from all submodules recursively (breadth first).

variables

Sequence of variables owned by this module and its submodules. Note: this method uses reflection to find variables on the current instance and submodules. For performance reasons you may wish to cache the result of calling this method if you don't expect the return value to change. #### Returns: A sequence of variables for the current module (sorted by attribute name) followed by variables from all submodules recursively (breadth first). ## Methods

__call__

``` python __call__( inputs, training, sequence_length=None, time_major=False ) ``` Runs the RNN layer. #### Arguments: * `inputs`: Tensor, a rank 3 input tensor with shape [N,T,C] if `time_major` is `False`, or with shape [T,N,C] if `time_major` is `True`. * `training`: bool, `True` if running in training mode, `False` if running in inference mode. * `sequence_length`: (optional) Tensor, a rank 1 tensor with shape [N] and dtype of `tf.int32` or `tf.int64`. This tensor specifies the unpadded length of each example in the input minibatch. * `time_major`: (optional) bool, specifies whether `input` has shape [N,T,C] (`time_major=False`) or shape [T,N,C] (`time_major=True`). #### Returns: A pair, `(output, state)` for unidirectional layers, or a pair `([output_fw, output_bw], [state_fw, state_bw])` for bidirectional layers.

build

``` python build(shape) ``` Creates the variables of the layer. Calling this method is optional for users of the RNN class. It is called internally with the correct shape when `__call__` is invoked. #### Arguments: * `shape`: instance of `TensorShape`.

with_name_scope

``` python @classmethod with_name_scope( cls, method ) ``` Decorator to automatically enter the module name scope. ``` class MyModule(tf.Module): @tf.Module.with_name_scope def __call__(self, x): if not hasattr(self, 'w'): self.w = tf.Variable(tf.random.normal([x.shape[1], 64])) return tf.matmul(x, self.w) ``` Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose names included the module name: ``` mod = MyModule() mod(tf.ones([8, 32])) # ==> mod.w # ==> ``` #### Args: * `method`: The method to wrap. #### Returns: The original method wrapped such that it enters the module's name scope. ================================================ FILE: docs/tf/haste_tf/LayerNormLSTMCell.md ================================================
# haste_tf.LayerNormLSTMCell ## Class `LayerNormLSTMCell` An LSTM cell that's compatible with the Haste LayerNormLSTM layer. This cell can be used on hardware other than GPUs and with other TensorFlow classes that operate on RNN cells (e.g. `dynamic_rnn`, `BasicDecoder`, cell wrappers, etc.).

__init__

``` python __init__( num_units, forget_bias=1.0, dropout=0.0, dtype=None, name=None, **kwargs ) ``` ## Properties

activity_regularizer

Optional regularizer function for the output of this layer.

dtype

dynamic

graph

DEPRECATED FUNCTION Warning: THIS FUNCTION IS DEPRECATED. It will be removed in a future version. Instructions for updating: Stop using this property because tf.layers layers no longer track their graph.

input

Retrieves the input tensor(s) of a layer. Only applicable if the layer has exactly one input, i.e. if it is connected to one incoming layer. #### Returns: Input tensor or list of input tensors. #### Raises: * `AttributeError`: if the layer is connected to more than one incoming layers. #### Raises: * `RuntimeError`: If called in Eager mode. * `AttributeError`: If no inbound nodes are found.

input_mask

Retrieves the input mask tensor(s) of a layer. Only applicable if the layer has exactly one inbound node, i.e. if it is connected to one incoming layer. #### Returns: Input mask tensor (potentially None) or list of input mask tensors. #### Raises: * `AttributeError`: if the layer is connected to more than one incoming layers.

input_shape

Retrieves the input shape(s) of a layer. Only applicable if the layer has exactly one input, i.e. if it is connected to one incoming layer, or if all inputs have the same shape. #### Returns: Input shape, as an integer shape tuple (or list of shape tuples, one tuple per input tensor). #### Raises: * `AttributeError`: if the layer has no defined input_shape. * `RuntimeError`: if called in Eager mode.

losses

Losses which are associated with this `Layer`. Variable regularization tensors are created when this property is accessed, so it is eager safe: accessing `losses` under a `tf.GradientTape` will propagate gradients back to the corresponding variables. #### Returns: A list of tensors.

metrics

name

Returns the name of this module as passed or determined in the ctor. NOTE: This is not the same as the `self.name_scope.name` which includes parent module names.

name_scope

Returns a `tf.name_scope` instance for this class.

non_trainable_variables

non_trainable_weights

output

Retrieves the output tensor(s) of a layer. Only applicable if the layer has exactly one output, i.e. if it is connected to one incoming layer. #### Returns: Output tensor or list of output tensors. #### Raises: * `AttributeError`: if the layer is connected to more than one incoming layers. * `RuntimeError`: if called in Eager mode.

output_mask

Retrieves the output mask tensor(s) of a layer. Only applicable if the layer has exactly one inbound node, i.e. if it is connected to one incoming layer. #### Returns: Output mask tensor (potentially None) or list of output mask tensors. #### Raises: * `AttributeError`: if the layer is connected to more than one incoming layers.

output_shape

Retrieves the output shape(s) of a layer. Only applicable if the layer has one output, or if all outputs have the same shape. #### Returns: Output shape, as an integer shape tuple (or list of shape tuples, one tuple per output tensor). #### Raises: * `AttributeError`: if the layer has no defined output shape. * `RuntimeError`: if called in Eager mode.

output_size

Integer or TensorShape: size of outputs produced by this cell.

scope_name

state_size

size(s) of state(s) used by this cell. It can be represented by an Integer, a TensorShape or a tuple of Integers or TensorShapes.

submodules

Sequence of all sub-modules. Submodules are modules which are properties of this module, or found as properties of modules which are properties of this module (and so on). ``` a = tf.Module() b = tf.Module() c = tf.Module() a.b = b b.c = c assert list(a.submodules) == [b, c] assert list(b.submodules) == [c] assert list(c.submodules) == [] ``` #### Returns: A sequence of all submodules.

trainable

trainable_variables

Sequence of variables owned by this module and its submodules. Note: this method uses reflection to find variables on the current instance and submodules. For performance reasons you may wish to cache the result of calling this method if you don't expect the return value to change. #### Returns: A sequence of variables for the current module (sorted by attribute name) followed by variables from all submodules recursively (breadth first).

trainable_weights

updates

variables

Returns the list of all layer variables/weights. Alias of `self.weights`. #### Returns: A list of variables.

weights

Returns the list of all layer variables/weights. #### Returns: A list of variables. ## Methods

__call__

``` python __call__( inputs, state, scope=None ) ``` Run this RNN cell on inputs, starting from the given state. #### Args: * `inputs`: `2-D` tensor with shape `[batch_size, input_size]`. * `state`: if `self.state_size` is an integer, this should be a `2-D Tensor` with shape `[batch_size, self.state_size]`. Otherwise, if `self.state_size` is a tuple of integers, this should be a tuple with shapes `[batch_size, s] for s in self.state_size`. * `scope`: VariableScope for the created subgraph; defaults to class name. #### Returns: * `A pair containing`: - Output: A `2-D` tensor with shape `[batch_size, self.output_size]`. - New state: Either a single `2-D` tensor, or a tuple of tensors matching the arity and shapes of `state`.

apply

``` python apply( inputs, *args, **kwargs ) ``` Apply the layer on an input. This is an alias of `self.__call__`. #### Arguments: * `inputs`: Input tensor(s). * `*args`: additional positional arguments to be passed to `self.call`. * `**kwargs`: additional keyword arguments to be passed to `self.call`. #### Returns: Output tensor(s).

build

``` python build(shape) ``` Creates the variables of the layer (optional, for subclass implementers). This is a method that implementers of subclasses of `Layer` or `Model` can override if they need a state-creation step in-between layer instantiation and layer call. This is typically used to create the weights of `Layer` subclasses. #### Arguments: * `input_shape`: Instance of `TensorShape`, or list of instances of `TensorShape` if the layer expects a list of inputs (one instance per input).

compute_mask

``` python compute_mask( inputs, mask=None ) ``` Computes an output mask tensor. #### Arguments: * `inputs`: Tensor or list of tensors. * `mask`: Tensor or list of tensors. #### Returns: None or a tensor (or list of tensors, one per output tensor of the layer).

compute_output_shape

``` python compute_output_shape(input_shape) ``` Computes the output shape of the layer. Assumes that the layer will be built to match that input shape provided. #### Arguments: * `input_shape`: Shape tuple (tuple of integers) or list of shape tuples (one per output tensor of the layer). Shape tuples can include None for free dimensions, instead of an integer. #### Returns: An input shape tuple.

count_params

``` python count_params() ``` Count the total number of scalars composing the weights. #### Returns: An integer count. #### Raises: * `ValueError`: if the layer isn't yet built (in which case its weights aren't yet defined).

from_config

``` python @classmethod from_config( cls, config ) ``` Creates a layer from its config. This method is the reverse of `get_config`, capable of instantiating the same layer from the config dictionary. It does not handle layer connectivity (handled by Network), nor weights (handled by `set_weights`). #### Arguments: * `config`: A Python dictionary, typically the output of get_config. #### Returns: A layer instance.

get_config

``` python get_config() ``` Returns the config of the layer. A layer config is a Python dictionary (serializable) containing the configuration of a layer. The same layer can be reinstantiated later (without its trained weights) from this configuration. The config of a layer does not include connectivity information, nor the layer class name. These are handled by `Network` (one layer of abstraction above). #### Returns: Python dictionary.

get_initial_state

``` python get_initial_state( inputs=None, batch_size=None, dtype=None ) ```

get_input_at

``` python get_input_at(node_index) ``` Retrieves the input tensor(s) of a layer at a given node. #### Arguments: * `node_index`: Integer, index of the node from which to retrieve the attribute. E.g. `node_index=0` will correspond to the first time the layer was called. #### Returns: A tensor (or list of tensors if the layer has multiple inputs). #### Raises: * `RuntimeError`: If called in Eager mode.

get_input_mask_at

``` python get_input_mask_at(node_index) ``` Retrieves the input mask tensor(s) of a layer at a given node. #### Arguments: * `node_index`: Integer, index of the node from which to retrieve the attribute. E.g. `node_index=0` will correspond to the first time the layer was called. #### Returns: A mask tensor (or list of tensors if the layer has multiple inputs).

get_input_shape_at

``` python get_input_shape_at(node_index) ``` Retrieves the input shape(s) of a layer at a given node. #### Arguments: * `node_index`: Integer, index of the node from which to retrieve the attribute. E.g. `node_index=0` will correspond to the first time the layer was called. #### Returns: A shape tuple (or list of shape tuples if the layer has multiple inputs). #### Raises: * `RuntimeError`: If called in Eager mode.

get_losses_for

``` python get_losses_for(inputs) ``` Retrieves losses relevant to a specific set of inputs. #### Arguments: * `inputs`: Input tensor or list/tuple of input tensors. #### Returns: List of loss tensors of the layer that depend on `inputs`.

get_output_at

``` python get_output_at(node_index) ``` Retrieves the output tensor(s) of a layer at a given node. #### Arguments: * `node_index`: Integer, index of the node from which to retrieve the attribute. E.g. `node_index=0` will correspond to the first time the layer was called. #### Returns: A tensor (or list of tensors if the layer has multiple outputs). #### Raises: * `RuntimeError`: If called in Eager mode.

get_output_mask_at

``` python get_output_mask_at(node_index) ``` Retrieves the output mask tensor(s) of a layer at a given node. #### Arguments: * `node_index`: Integer, index of the node from which to retrieve the attribute. E.g. `node_index=0` will correspond to the first time the layer was called. #### Returns: A mask tensor (or list of tensors if the layer has multiple outputs).

get_output_shape_at

``` python get_output_shape_at(node_index) ``` Retrieves the output shape(s) of a layer at a given node. #### Arguments: * `node_index`: Integer, index of the node from which to retrieve the attribute. E.g. `node_index=0` will correspond to the first time the layer was called. #### Returns: A shape tuple (or list of shape tuples if the layer has multiple outputs). #### Raises: * `RuntimeError`: If called in Eager mode.

get_updates_for

``` python get_updates_for(inputs) ``` Retrieves updates relevant to a specific set of inputs. #### Arguments: * `inputs`: Input tensor or list/tuple of input tensors. #### Returns: List of update ops of the layer that depend on `inputs`.

get_weights

``` python get_weights() ``` Returns the current weights of the layer. #### Returns: Weights values as a list of numpy arrays.

set_weights

``` python set_weights(weights) ``` Sets the weights of the layer, from Numpy arrays. #### Arguments: * `weights`: a list of Numpy arrays. The number of arrays and their shape must match number of the dimensions of the weights of the layer (i.e. it should match the output of `get_weights`). #### Raises: * `ValueError`: If the provided weights list does not match the layer's specifications.

with_name_scope

``` python @classmethod with_name_scope( cls, method ) ``` Decorator to automatically enter the module name scope. ``` class MyModule(tf.Module): @tf.Module.with_name_scope def __call__(self, x): if not hasattr(self, 'w'): self.w = tf.Variable(tf.random.normal([x.shape[1], 64])) return tf.matmul(x, self.w) ``` Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose names included the module name: ``` mod = MyModule() mod(tf.ones([8, 32])) # ==> mod.w # ==> ``` #### Args: * `method`: The method to wrap. #### Returns: The original method wrapped such that it enters the module's name scope.

zero_state

``` python zero_state( batch_size, dtype ) ``` Return zero-filled state tensor(s). #### Args: * `batch_size`: int, float, or unit Tensor representing the batch size. * `dtype`: the data type to use for the state. #### Returns: If `state_size` is an int or TensorShape, then the return value is a `N-D` tensor of shape `[batch_size, state_size]` filled with zeros. If `state_size` is a nested list or tuple, then the return value is a nested list or tuple (of the same structure) of `2-D` tensors with the shapes `[batch_size, s]` for each s in `state_size`. ================================================ FILE: docs/tf/haste_tf/ZoneoutWrapper.md ================================================
# haste_tf.ZoneoutWrapper ## Class `ZoneoutWrapper` An LSTM/GRU cell wrapper that applies zoneout to the inner cell's hidden state. The zoneout paper applies zoneout to both the cell state and hidden state, each with its own zoneout rate. This class (and the `LSTM` implementation in Haste) applies zoneout to the hidden state and not the cell state.

__init__

``` python __init__( cell, rate, training ) ``` Initialize the parameters of the zoneout wrapper. #### Arguments: * `cell`: RNNCell, an instance of {`BasicLSTMCell`, `LSTMCell`, `LSTMBlockCell`, haste_tf.GRUCell} on which to apply zoneout. * `rate`: float, 0 <= rate <= 1, the percent of hidden units to zone out per time step. * `training`: bool, `True` if used during training, `False` if used during inference. ## Properties

activity_regularizer

Optional regularizer function for the output of this layer.

dtype

dynamic

graph

DEPRECATED FUNCTION Warning: THIS FUNCTION IS DEPRECATED. It will be removed in a future version. Instructions for updating: Stop using this property because tf.layers layers no longer track their graph.

input

Retrieves the input tensor(s) of a layer. Only applicable if the layer has exactly one input, i.e. if it is connected to one incoming layer. #### Returns: Input tensor or list of input tensors. #### Raises: * `AttributeError`: if the layer is connected to more than one incoming layers. #### Raises: * `RuntimeError`: If called in Eager mode. * `AttributeError`: If no inbound nodes are found.

input_mask

Retrieves the input mask tensor(s) of a layer. Only applicable if the layer has exactly one inbound node, i.e. if it is connected to one incoming layer. #### Returns: Input mask tensor (potentially None) or list of input mask tensors. #### Raises: * `AttributeError`: if the layer is connected to more than one incoming layers.

input_shape

Retrieves the input shape(s) of a layer. Only applicable if the layer has exactly one input, i.e. if it is connected to one incoming layer, or if all inputs have the same shape. #### Returns: Input shape, as an integer shape tuple (or list of shape tuples, one tuple per input tensor). #### Raises: * `AttributeError`: if the layer has no defined input_shape. * `RuntimeError`: if called in Eager mode.

losses

Losses which are associated with this `Layer`. Variable regularization tensors are created when this property is accessed, so it is eager safe: accessing `losses` under a `tf.GradientTape` will propagate gradients back to the corresponding variables. #### Returns: A list of tensors.

metrics

name

Returns the name of this module as passed or determined in the ctor. NOTE: This is not the same as the `self.name_scope.name` which includes parent module names.

name_scope

Returns a `tf.name_scope` instance for this class.

non_trainable_variables

non_trainable_weights

output

Retrieves the output tensor(s) of a layer. Only applicable if the layer has exactly one output, i.e. if it is connected to one incoming layer. #### Returns: Output tensor or list of output tensors. #### Raises: * `AttributeError`: if the layer is connected to more than one incoming layers. * `RuntimeError`: if called in Eager mode.

output_mask

Retrieves the output mask tensor(s) of a layer. Only applicable if the layer has exactly one inbound node, i.e. if it is connected to one incoming layer. #### Returns: Output mask tensor (potentially None) or list of output mask tensors. #### Raises: * `AttributeError`: if the layer is connected to more than one incoming layers.

output_shape

Retrieves the output shape(s) of a layer. Only applicable if the layer has one output, or if all outputs have the same shape. #### Returns: Output shape, as an integer shape tuple (or list of shape tuples, one tuple per output tensor). #### Raises: * `AttributeError`: if the layer has no defined output shape. * `RuntimeError`: if called in Eager mode.

output_size

Integer or TensorShape: size of outputs produced by this cell.

scope_name

state_size

size(s) of state(s) used by this cell. It can be represented by an Integer, a TensorShape or a tuple of Integers or TensorShapes.

submodules

Sequence of all sub-modules. Submodules are modules which are properties of this module, or found as properties of modules which are properties of this module (and so on). ``` a = tf.Module() b = tf.Module() c = tf.Module() a.b = b b.c = c assert list(a.submodules) == [b, c] assert list(b.submodules) == [c] assert list(c.submodules) == [] ``` #### Returns: A sequence of all submodules.

trainable

trainable_variables

Sequence of variables owned by this module and its submodules. Note: this method uses reflection to find variables on the current instance and submodules. For performance reasons you may wish to cache the result of calling this method if you don't expect the return value to change. #### Returns: A sequence of variables for the current module (sorted by attribute name) followed by variables from all submodules recursively (breadth first).

trainable_weights

updates

variables

Returns the list of all layer variables/weights. Alias of `self.weights`. #### Returns: A list of variables.

weights

Returns the list of all layer variables/weights. #### Returns: A list of variables. ## Methods

__call__

``` python __call__( inputs, state, scope=None ) ``` Runs one step of the RNN cell with zoneout applied. #### Arguments: see documentation for the inner cell.

apply

``` python apply( inputs, *args, **kwargs ) ``` Apply the layer on an input. This is an alias of `self.__call__`. #### Arguments: * `inputs`: Input tensor(s). * `*args`: additional positional arguments to be passed to `self.call`. * `**kwargs`: additional keyword arguments to be passed to `self.call`. #### Returns: Output tensor(s).

build

``` python build(_) ``` Creates the variables of the layer (optional, for subclass implementers). This is a method that implementers of subclasses of `Layer` or `Model` can override if they need a state-creation step in-between layer instantiation and layer call. This is typically used to create the weights of `Layer` subclasses. #### Arguments: * `input_shape`: Instance of `TensorShape`, or list of instances of `TensorShape` if the layer expects a list of inputs (one instance per input).

compute_mask

``` python compute_mask( inputs, mask=None ) ``` Computes an output mask tensor. #### Arguments: * `inputs`: Tensor or list of tensors. * `mask`: Tensor or list of tensors. #### Returns: None or a tensor (or list of tensors, one per output tensor of the layer).

compute_output_shape

``` python compute_output_shape(input_shape) ``` Computes the output shape of the layer. Assumes that the layer will be built to match that input shape provided. #### Arguments: * `input_shape`: Shape tuple (tuple of integers) or list of shape tuples (one per output tensor of the layer). Shape tuples can include None for free dimensions, instead of an integer. #### Returns: An input shape tuple.

count_params

``` python count_params() ``` Count the total number of scalars composing the weights. #### Returns: An integer count. #### Raises: * `ValueError`: if the layer isn't yet built (in which case its weights aren't yet defined).

from_config

``` python @classmethod from_config( cls, config ) ``` Creates a layer from its config. This method is the reverse of `get_config`, capable of instantiating the same layer from the config dictionary. It does not handle layer connectivity (handled by Network), nor weights (handled by `set_weights`). #### Arguments: * `config`: A Python dictionary, typically the output of get_config. #### Returns: A layer instance.

get_config

``` python get_config() ``` Returns the config of the layer. A layer config is a Python dictionary (serializable) containing the configuration of a layer. The same layer can be reinstantiated later (without its trained weights) from this configuration. The config of a layer does not include connectivity information, nor the layer class name. These are handled by `Network` (one layer of abstraction above). #### Returns: Python dictionary.

get_initial_state

``` python get_initial_state( inputs=None, batch_size=None, dtype=None ) ```

get_input_at

``` python get_input_at(node_index) ``` Retrieves the input tensor(s) of a layer at a given node. #### Arguments: * `node_index`: Integer, index of the node from which to retrieve the attribute. E.g. `node_index=0` will correspond to the first time the layer was called. #### Returns: A tensor (or list of tensors if the layer has multiple inputs). #### Raises: * `RuntimeError`: If called in Eager mode.

get_input_mask_at

``` python get_input_mask_at(node_index) ``` Retrieves the input mask tensor(s) of a layer at a given node. #### Arguments: * `node_index`: Integer, index of the node from which to retrieve the attribute. E.g. `node_index=0` will correspond to the first time the layer was called. #### Returns: A mask tensor (or list of tensors if the layer has multiple inputs).

get_input_shape_at

``` python get_input_shape_at(node_index) ``` Retrieves the input shape(s) of a layer at a given node. #### Arguments: * `node_index`: Integer, index of the node from which to retrieve the attribute. E.g. `node_index=0` will correspond to the first time the layer was called. #### Returns: A shape tuple (or list of shape tuples if the layer has multiple inputs). #### Raises: * `RuntimeError`: If called in Eager mode.

get_losses_for

``` python get_losses_for(inputs) ``` Retrieves losses relevant to a specific set of inputs. #### Arguments: * `inputs`: Input tensor or list/tuple of input tensors. #### Returns: List of loss tensors of the layer that depend on `inputs`.

get_output_at

``` python get_output_at(node_index) ``` Retrieves the output tensor(s) of a layer at a given node. #### Arguments: * `node_index`: Integer, index of the node from which to retrieve the attribute. E.g. `node_index=0` will correspond to the first time the layer was called. #### Returns: A tensor (or list of tensors if the layer has multiple outputs). #### Raises: * `RuntimeError`: If called in Eager mode.

get_output_mask_at

``` python get_output_mask_at(node_index) ``` Retrieves the output mask tensor(s) of a layer at a given node. #### Arguments: * `node_index`: Integer, index of the node from which to retrieve the attribute. E.g. `node_index=0` will correspond to the first time the layer was called. #### Returns: A mask tensor (or list of tensors if the layer has multiple outputs).

get_output_shape_at

``` python get_output_shape_at(node_index) ``` Retrieves the output shape(s) of a layer at a given node. #### Arguments: * `node_index`: Integer, index of the node from which to retrieve the attribute. E.g. `node_index=0` will correspond to the first time the layer was called. #### Returns: A shape tuple (or list of shape tuples if the layer has multiple outputs). #### Raises: * `RuntimeError`: If called in Eager mode.

get_updates_for

``` python get_updates_for(inputs) ``` Retrieves updates relevant to a specific set of inputs. #### Arguments: * `inputs`: Input tensor or list/tuple of input tensors. #### Returns: List of update ops of the layer that depend on `inputs`.

get_weights

``` python get_weights() ``` Returns the current weights of the layer. #### Returns: Weights values as a list of numpy arrays.

set_weights

``` python set_weights(weights) ``` Sets the weights of the layer, from Numpy arrays. #### Arguments: * `weights`: a list of Numpy arrays. The number of arrays and their shape must match number of the dimensions of the weights of the layer (i.e. it should match the output of `get_weights`). #### Raises: * `ValueError`: If the provided weights list does not match the layer's specifications.

with_name_scope

``` python @classmethod with_name_scope( cls, method ) ``` Decorator to automatically enter the module name scope. ``` class MyModule(tf.Module): @tf.Module.with_name_scope def __call__(self, x): if not hasattr(self, 'w'): self.w = tf.Variable(tf.random.normal([x.shape[1], 64])) return tf.matmul(x, self.w) ``` Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose names included the module name: ``` mod = MyModule() mod(tf.ones([8, 32])) # ==> mod.w # ==> ``` #### Args: * `method`: The method to wrap. #### Returns: The original method wrapped such that it enters the module's name scope.

zero_state

``` python zero_state( batch_size, dtype ) ``` Return zero-filled state tensor(s). #### Args: * `batch_size`: int, float, or unit Tensor representing the batch size. * `dtype`: the data type to use for the state. #### Returns: If `state_size` is an int or TensorShape, then the return value is a `N-D` tensor of shape `[batch_size, state_size]` filled with zeros. If `state_size` is a nested list or tuple, then the return value is a nested list or tuple (of the same structure) of `2-D` tensors with the shapes `[batch_size, s]` for each s in `state_size`. ================================================ FILE: docs/tf/haste_tf.md ================================================
# Module: haste_tf Haste: a fast, simple, and open RNN library. ## Classes [`class GRU`](./haste_tf/GRU.md): Gated Recurrent Unit layer. [`class GRUCell`](./haste_tf/GRUCell.md): A GRU cell that's compatible with the Haste GRU layer. [`class IndRNN`](./haste_tf/IndRNN.md): Independently Recurrent Neural Network layer. [`class LSTM`](./haste_tf/LSTM.md): Long Short-Term Memory layer. [`class LayerNorm`](./haste_tf/LayerNorm.md): Layer normalization layer. [`class LayerNormGRU`](./haste_tf/LayerNormGRU.md): Layer Normalized Gated Recurrent Unit layer. [`class LayerNormGRUCell`](./haste_tf/LayerNormGRUCell.md): A GRU cell that's compatible with the Haste LayerNormGRU layer. [`class LayerNormLSTM`](./haste_tf/LayerNormLSTM.md): Layer Normalized Long Short-Term Memory layer. [`class LayerNormLSTMCell`](./haste_tf/LayerNormLSTMCell.md): An LSTM cell that's compatible with the Haste LayerNormLSTM layer. [`class ZoneoutWrapper`](./haste_tf/ZoneoutWrapper.md): An LSTM/GRU cell wrapper that applies zoneout to the inner cell's hidden state. ================================================ FILE: examples/device_ptr.h ================================================ // Copyright 2020 LMNT, Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
// ============================================================================== #pragma once #include #include #include template struct device_ptr { static constexpr size_t ElemSize = sizeof(typename T::Scalar); static device_ptr NewByteSized(size_t bytes) { return device_ptr((bytes + ElemSize - 1) / ElemSize); } explicit device_ptr(size_t size_) : data(nullptr), size(size_) { void* tmp; cudaMalloc(&tmp, size * ElemSize); data = static_cast(tmp); } explicit device_ptr(const T& elem) : data(nullptr), size(elem.size()) { void* tmp; cudaMalloc(&tmp, size * ElemSize); data = static_cast(tmp); ToDevice(elem); } device_ptr(device_ptr&& other) : data(other.data), size(other.size) { other.data = nullptr; other.size = 0; } device_ptr& operator=(const device_ptr&& other) { if (&other != this) { data = other.data; size = other.size; other.data = nullptr; other.size = 0; } return *this; } device_ptr(const device_ptr& other) = delete; device_ptr& operator=(const device_ptr& other) = delete; void ToDevice(const T& src) { assert(size == src.size()); cudaMemcpy(data, src.data(), src.size() * ElemSize, cudaMemcpyHostToDevice); } void ToHost(T& target) const { assert(size == target.size()); cudaMemcpy(target.data(), data, target.size() * ElemSize, cudaMemcpyDeviceToHost); } size_t Size() const { return size; } void zero() { cudaMemset(data, 0, size * ElemSize); } ~device_ptr() { cudaFree(data); } typename T::Scalar* data; size_t size; }; ================================================ FILE: examples/gru.cc ================================================ // Copyright 2020 LMNT, Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // ============================================================================== #include #include #include #include #include #include #include #include #include #include #include #include #include "device_ptr.h" #include "haste.h" using haste::v0::gru::ForwardPass; using haste::v0::gru::BackwardPass; using std::string; using Tensor1 = Eigen::Tensor; using Tensor2 = Eigen::Tensor; using Tensor3 = Eigen::Tensor; constexpr int BATCH_SIZE = 64; constexpr int SEQUENCE_LEN = 1000; constexpr int HIDDEN_DIMS = 512; constexpr int INPUT_DIMS = 512; static cublasHandle_t g_blas_handle; class ScopeTimer { public: ScopeTimer(const string& msg) : msg_(msg) { cudaEventCreate(&start_); cudaEventCreate(&stop_); cudaDeviceSynchronize(); cudaEventRecord(start_); } ~ScopeTimer() { float elapsed_ms; cudaEventRecord(stop_); cudaEventSynchronize(stop_); cudaEventElapsedTime(&elapsed_ms, start_, stop_); printf("%s %fms\n", msg_.c_str(), elapsed_ms); cudaEventDestroy(start_); cudaEventDestroy(stop_); } private: string msg_; cudaEvent_t start_, stop_; }; void GruInference( const Tensor2& W, const Tensor2& R, const Tensor1& bx, const Tensor1& br, const Tensor3& x) { const int time_steps = x.dimension(2); const int batch_size = x.dimension(1); const int input_size = x.dimension(0); const int hidden_size = R.dimension(1); // Copy weights over to GPU. 
device_ptr W_dev(W); device_ptr R_dev(R); device_ptr bx_dev(bx); device_ptr br_dev(br); device_ptr x_dev(x); device_ptr h_dev((time_steps + 1) * batch_size * hidden_size); device_ptr tmp_Wx_dev(time_steps * batch_size * hidden_size * 3); device_ptr tmp_Rh_dev(batch_size * hidden_size * 3); h_dev.zero(); ScopeTimer t("Inference:"); ForwardPass forward = ForwardPass( false, // training batch_size, input_size, hidden_size, g_blas_handle); forward.Run( time_steps, W_dev.data, R_dev.data, bx_dev.data, br_dev.data, x_dev.data, h_dev.data, nullptr, tmp_Wx_dev.data, tmp_Rh_dev.data, 0.0f, nullptr); } void GruTrain( const Tensor2& W, const Tensor2& R, const Tensor1& bx, const Tensor1& br, const Tensor3& x, const Tensor3& dh_new) { const int time_steps = x.dimension(2); const int batch_size = x.dimension(1); const int input_size = x.dimension(0); const int hidden_size = R.dimension(1); // Copy weights over to GPU. device_ptr W_dev(W); device_ptr R_dev(R); device_ptr bx_dev(bx); device_ptr br_dev(br); device_ptr x_dev(x); device_ptr dh_new_dev(dh_new); device_ptr h_dev((time_steps + 1) * batch_size * hidden_size); device_ptr tmp_Wx_dev(time_steps * batch_size * hidden_size * 3); device_ptr tmp_Rh_dev(batch_size * hidden_size * 3); device_ptr v_dev(time_steps * batch_size * hidden_size * 4); h_dev.zero(); { ScopeTimer t("Train forward:"); ForwardPass forward = ForwardPass( true, // training batch_size, input_size, hidden_size, g_blas_handle); forward.Run( time_steps, W_dev.data, R_dev.data, bx_dev.data, br_dev.data, x_dev.data, h_dev.data, v_dev.data, tmp_Wx_dev.data, tmp_Rh_dev.data, 0.0f, nullptr); } device_ptr dx_dev(time_steps * batch_size * input_size); device_ptr dW_dev(input_size * hidden_size * 3); device_ptr dR_dev(hidden_size * hidden_size * 3); device_ptr dbx_dev(hidden_size * 3); device_ptr dbr_dev(hidden_size * 3); device_ptr dh_dev(batch_size * hidden_size); device_ptr dp_dev(time_steps * batch_size * hidden_size * 3); device_ptr dq_dev(time_steps * batch_size * 
hidden_size * 3); { ScopeTimer t("Train backward:"); BackwardPass backward( batch_size, input_size, hidden_size, g_blas_handle); backward.Run( time_steps, W_dev.data, R_dev.data, bx_dev.data, br_dev.data, x_dev.data, h_dev.data, v_dev.data, dh_new_dev.data, dx_dev.data, dW_dev.data, dR_dev.data, dbx_dev.data, dbr_dev.data, dh_dev.data, dp_dev.data, dq_dev.data, nullptr); } } int main() { srand(time(0)); cublasCreate(&g_blas_handle); // Weights. Tensor2 W(HIDDEN_DIMS * 3, INPUT_DIMS); Tensor2 R(HIDDEN_DIMS * 3, HIDDEN_DIMS); Tensor1 bx(HIDDEN_DIMS * 3); Tensor1 br(HIDDEN_DIMS * 3); // Input. Tensor3 x(INPUT_DIMS, BATCH_SIZE, SEQUENCE_LEN); // Gradients from upstream layers. Tensor3 dh(HIDDEN_DIMS, BATCH_SIZE, SEQUENCE_LEN + 1); W.setRandom(); R.setRandom(); bx.setRandom(); br.setRandom(); x.setRandom(); dh.setRandom(); GruInference(W, R, bx, br, x); GruTrain(W, R, bx, br, x, dh); cublasDestroy(g_blas_handle); return 0; } ================================================ FILE: examples/lstm.cc ================================================ // Copyright 2020 LMNT, Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
// ==============================================================================

// NOTE(review): the angle-bracket header names of the system includes were lost
// during extraction; the bare `#include` tokens below are kept as found.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "device_ptr.h"
#include "haste.h"

using haste::v0::lstm::BackwardPass;
using haste::v0::lstm::ForwardPass;
using std::string;

// NOTE(review): the Eigen::Tensor<...> template arguments were stripped by
// extraction — presumably Tensor<float, 1/2/3>; confirm against the repository.
using Tensor1 = Eigen::Tensor;
using Tensor2 = Eigen::Tensor;
using Tensor3 = Eigen::Tensor;

// Benchmark configuration.
constexpr int BATCH_SIZE = 64;
constexpr int SEQUENCE_LEN = 1000;
constexpr int HIDDEN_DIMS = 512;
constexpr int INPUT_DIMS = 512;

static cublasHandle_t g_blas_handle;

// RAII timer: records a CUDA start event on construction and, on destruction,
// records/synchronizes a stop event and prints the elapsed time in ms.
class ScopeTimer {
 public:
  ScopeTimer(const string& msg) : msg_(msg) {
    cudaEventCreate(&start_);
    cudaEventCreate(&stop_);
    // Drain previously-queued work so it isn't attributed to this timer.
    cudaDeviceSynchronize();
    cudaEventRecord(start_);
  }

  ~ScopeTimer() {
    float elapsed_ms;
    cudaEventRecord(stop_);
    cudaEventSynchronize(stop_);
    cudaEventElapsedTime(&elapsed_ms, start_, stop_);
    printf("%s %.1fms\n", msg_.c_str(), elapsed_ms);
    cudaEventDestroy(start_);
    cudaEventDestroy(stop_);
  }

 private:
  string msg_;
  cudaEvent_t start_, stop_;
};

// Times a single inference-mode LSTM forward pass over the whole sequence.
// NOTE(review): device_ptr template arguments stripped by extraction here too.
void LstmInference(const Tensor2& W, const Tensor2& R, const Tensor1& b, const Tensor3& x) {
  const int time_steps = x.dimension(2);
  const int batch_size = x.dimension(1);
  const int input_size = x.dimension(0);
  const int hidden_size = R.dimension(1);

  // Copy weights over to GPU.
  device_ptr W_dev(W);
  device_ptr R_dev(R);
  device_ptr b_dev(b);
  device_ptr x_dev(x);
  device_ptr h_dev((time_steps + 1) * batch_size * hidden_size);
  device_ptr c_dev((time_steps + 1) * batch_size * hidden_size);
  device_ptr v_dev(time_steps * batch_size * hidden_size * 4);
  device_ptr tmp_Rh_dev(batch_size * hidden_size * 4);
  h_dev.zero();
  c_dev.zero();

  ScopeTimer t("Inference:");
  ForwardPass forward(
      false,  // training
      batch_size,
      input_size,
      hidden_size,
      g_blas_handle);
  forward.Run(
      time_steps,
      W_dev.data,
      R_dev.data,
      b_dev.data,
      x_dev.data,
      h_dev.data,
      c_dev.data,
      v_dev.data,
      tmp_Rh_dev.data,
      0.0f,      // zoneout prob
      nullptr);  // zoneout mask
}

// Times one LSTM training step: a training-mode forward pass over the full
// sequence followed by a full backward pass, each via a single Run() call.
void LstmTrain(const Tensor2& W, const Tensor2& R, const Tensor1& b, const Tensor3& x,
               const Tensor3& dh, const Tensor3& dc) {
  const int time_steps = x.dimension(2);
  const int batch_size = x.dimension(1);
  const int input_size = x.dimension(0);
  const int hidden_size = R.dimension(1);

  // Copy weights over to GPU.
  device_ptr W_dev(W);
  device_ptr R_dev(R);
  device_ptr b_dev(b);
  device_ptr x_dev(x);

  // This is nearly the same as the inference code except we have an extra dimension
  // for h and c. We'll store those outputs of the cell for all time steps and use
  // them during the backward pass below.
// (continuation of LstmTrain) — buffers for all time steps, saved for backprop.
  device_ptr h_dev((time_steps + 1) * batch_size * hidden_size);
  device_ptr c_dev((time_steps + 1) * batch_size * hidden_size);
  device_ptr v_dev(batch_size * hidden_size * 4 * time_steps);  // activation cache
  device_ptr tmp_Rh_dev(batch_size * hidden_size * 4);
  h_dev.zero();
  c_dev.zero();

  {
    ScopeTimer t("Train forward:");
    ForwardPass forward(
        true,  // training
        batch_size,
        input_size,
        hidden_size,
        g_blas_handle);
    forward.Run(
        time_steps,
        W_dev.data,
        R_dev.data,
        b_dev.data,
        x_dev.data,
        h_dev.data,
        c_dev.data,
        v_dev.data,
        tmp_Rh_dev.data,
        0.0f,      // zoneout prob
        nullptr);  // zoneout mask
  }

  // The backward pass wants transposed inputs/weights; shuffle on the host and
  // re-upload. NOTE(review): Eigen::array template arguments were stripped by
  // extraction (presumably Eigen::array<int, 3> / <int, 2>).
  Eigen::array transpose_x({ 1, 2, 0 });
  Tensor3 x_t = x.shuffle(transpose_x);

  Eigen::array transpose({ 1, 0 });
  Tensor2 W_t = W.shuffle(transpose);
  Tensor2 R_t = R.shuffle(transpose);

  device_ptr x_t_dev(x_t);
  device_ptr W_t_dev(W_t);
  device_ptr R_t_dev(R_t);

  // These gradients should actually come "from above" but we're just allocating
  // a bunch of uninitialized memory and passing it in.
  device_ptr dh_new_dev(dh);
  device_ptr dc_new_dev(dc);
  device_ptr dx_dev(time_steps * batch_size * input_size);
  device_ptr dW_dev(input_size * hidden_size * 4);
  device_ptr dR_dev(hidden_size * hidden_size * 4);
  device_ptr db_dev(hidden_size * 4);
  device_ptr dh_dev(batch_size * hidden_size);
  device_ptr dc_dev(batch_size * hidden_size);
  // Gradient accumulators must start at zero.
  dW_dev.zero();
  dR_dev.zero();
  db_dev.zero();
  dh_dev.zero();
  dc_dev.zero();

  {
    ScopeTimer t("Train backward:");
    BackwardPass backward(
        batch_size,
        input_size,
        hidden_size,
        g_blas_handle);
    backward.Run(
        time_steps,
        W_t_dev.data,
        R_t_dev.data,
        b_dev.data,
        x_t_dev.data,
        h_dev.data,
        c_dev.data,
        dh_new_dev.data,
        dc_new_dev.data,
        dx_dev.data,
        dW_dev.data,
        dR_dev.data,
        db_dev.data,
        dh_dev.data,
        dc_dev.data,
        v_dev.data,
        nullptr);  // zoneout mask
  }
}

// Same training step as LstmTrain but drives the kernels one time step at a
// time with Iterate() instead of a single Run() call over the whole sequence.
void LstmTrainIterative(const Tensor2& W, const Tensor2& R, const Tensor1& b, const Tensor3& x,
                        const Tensor3& dh, const Tensor3& dc) {
  const int time_steps = x.dimension(2);
  const int batch_size = x.dimension(1);
  const int input_size = x.dimension(0);
  const int hidden_size = R.dimension(1);

  // Copy weights over to GPU.
  device_ptr W_dev(W);
  device_ptr R_dev(R);
  device_ptr b_dev(b);
  device_ptr x_dev(x);
  device_ptr h_dev((time_steps + 1) * batch_size * hidden_size);
  device_ptr c_dev((time_steps + 1) * batch_size * hidden_size);
  device_ptr v_dev(time_steps * batch_size * hidden_size * 4);
  device_ptr tmp_Rh_dev(batch_size * hidden_size * 4);
  h_dev.zero();
  c_dev.zero();

  {
    ScopeTimer t("Train forward (iterative):");
    ForwardPass forward(
        true,  // training
        batch_size,
        input_size,
        hidden_size,
        g_blas_handle);
    // Per-step strides into the flat input (NC) and state (NH) buffers.
    const int NC = batch_size * input_size;
    const int NH = batch_size * hidden_size;
    for (int t = 0; t < time_steps; ++t) {
      forward.Iterate(
          0,
          W_dev.data,
          R_dev.data,
          b_dev.data,
          x_dev.data + t * NC,
          h_dev.data + t * NH,        // state in
          c_dev.data + t * NH,
          h_dev.data + (t + 1) * NH,  // state out
          c_dev.data + (t + 1) * NH,
          v_dev.data + t * NH * 4,
          tmp_Rh_dev.data,
          0.0f,      // zoneout prob
          nullptr);  // zoneout mask
    }
  }

  // Host-side transposes for the backward pass, as in LstmTrain.
  Eigen::array transpose_x({ 1, 2, 0 });
  Tensor3 x_t = x.shuffle(transpose_x);

  Eigen::array transpose({ 1, 0 });
  Tensor2 W_t = W.shuffle(transpose);
  Tensor2 R_t = R.shuffle(transpose);

  device_ptr x_t_dev(x_t);
  device_ptr W_t_dev(W_t);
  device_ptr R_t_dev(R_t);

  // These gradients should actually come "from above" but we're just allocating
  // a bunch of uninitialized memory and passing it in.
  device_ptr dh_new_dev(dh);
  device_ptr dc_new_dev(dc);
  device_ptr dx_dev(time_steps * batch_size * input_size);
  device_ptr dW_dev(input_size * hidden_size * 4);
  device_ptr dR_dev(hidden_size * hidden_size * 4);
  device_ptr db_dev(hidden_size * 4);
  device_ptr dh_dev(batch_size * hidden_size);
  device_ptr dc_dev(batch_size * hidden_size);
  dW_dev.zero();
  dR_dev.zero();
  db_dev.zero();
  dh_dev.zero();
  dc_dev.zero();

  {
    ScopeTimer t("Train backward (iterative):");
    BackwardPass backward(
        batch_size,
        input_size,
        hidden_size,
        g_blas_handle);
    const int NC = batch_size * input_size;
    const int NH = batch_size * hidden_size;
    // Backward runs over the sequence in reverse time order.
    for (int t = time_steps - 1; t >= 0; --t) {
      backward.Iterate(
          0,
          W_t_dev.data,
          R_t_dev.data,
          b_dev.data,
          x_t_dev.data + t * NC,
          h_dev.data + t * NH,
          c_dev.data + t * NH,
          c_dev.data + (t + 1) * NH,
          dh_new_dev.data + t * NH,
          dc_new_dev.data + t * NH,
          dx_dev.data + t * NC,
          dW_dev.data,
          dR_dev.data,
          db_dev.data,
          dh_dev.data,
          dc_dev.data,
          v_dev.data + t * NH * 4,
          nullptr);  // zoneout mask
    }
  }
}

// Entry point: random weights/inputs, then times inference plus both training variants.
int main() {
  srand(time(0));
  cublasCreate(&g_blas_handle);

  // Weights.
  // W: input weight matrix
  // R: recurrent weight matrix
  // b: bias
  Tensor2 W(HIDDEN_DIMS * 4, INPUT_DIMS);
  Tensor2 R(HIDDEN_DIMS * 4, HIDDEN_DIMS);
  Tensor1 b(HIDDEN_DIMS * 4);

  // Input.
  Tensor3 x(INPUT_DIMS, BATCH_SIZE, SEQUENCE_LEN);

  // Gradients from upstream layers.
  Tensor3 dh(HIDDEN_DIMS, BATCH_SIZE, SEQUENCE_LEN + 1);
  Tensor3 dc(HIDDEN_DIMS, BATCH_SIZE, SEQUENCE_LEN + 1);

  W.setRandom();
  R.setRandom();
  b.setRandom();
  x.setRandom();
  dh.setRandom();
  dc.setRandom();

  LstmInference(W, R, b, x);
  LstmTrain(W, R, b, x, dh, dc);
  LstmTrainIterative(W, R, b, x, dh, dc);

  cublasDestroy(g_blas_handle);
  return 0;
}
================================================ FILE: frameworks/pytorch/__init__.py ================================================
# Copyright 2020 LMNT, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """ Haste: a fast, simple, and open RNN library. """ import torch as _ from ._version import __version__ # generated in setup.py from .gru import GRU from .indrnn import IndRNN from .lstm import LSTM from .layer_norm_gru import LayerNormGRU from .layer_norm_indrnn import LayerNormIndRNN from .layer_norm_lstm import LayerNormLSTM __all__ = [ 'GRU', 'IndRNN', 'LSTM', 'LayerNormGRU', 'LayerNormIndRNN', 'LayerNormLSTM' ] ================================================ FILE: frameworks/pytorch/base_rnn.py ================================================ # Copyright 2020 LMNT, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# ==============================================================================

import torch
import torch.nn as nn

__all__ = [
    'BaseRNN'
]


class BaseRNN(nn.Module):
  """Shared plumbing for the Haste PyTorch RNN layers.

  Handles batch-first permutation, initial-state construction/validation,
  final-state extraction, zoneout mask sampling, and CPU/CUDA dispatch checks.
  """

  def __init__(
      self,
      input_size,
      hidden_size,
      batch_first,
      zoneout,
      return_state_sequence):
    super().__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.batch_first = batch_first
    self.zoneout = zoneout
    self.return_state_sequence = return_state_sequence

  def _permute(self, x):
    # Convert between (batch, seq, feature) and (seq, batch, feature) when the
    # layer was configured with batch_first; otherwise pass through unchanged.
    if self.batch_first:
      return x.permute(1, 0, 2)
    return x

  def _get_state(self, input, state, state_shape):
    # Use the caller-provided initial state if given (after validating its
    # structure/shape), otherwise build a zero state matching `state_shape`.
    if state is None:
      state = _zero_state(input, state_shape)
    else:
      _validate_state(state, state_shape)
    return state

  def _get_final_state(self, state, lengths):
    # `state` holds time_steps + 1 entries (index 0 is the initial state).
    # Recurse through tuple/list structures (e.g. LSTM's (h, c)).
    if isinstance(state, tuple):
      return tuple(self._get_final_state(s, lengths) for s in state)
    if isinstance(state, list):
      return [self._get_final_state(s, lengths) for s in state]
    if self.return_state_sequence:
      # Drop the initial state and return the whole per-step state sequence.
      return self._permute(state[1:]).unsqueeze(0)
    if lengths is not None:
      # For padded batches, pick each sequence's last valid state.
      cols = range(state.size(1))
      return state[[lengths, cols]].unsqueeze(0)
    return state[-1].unsqueeze(0)

  def _get_zoneout_mask(self, input):
    # Sample a Bernoulli keep-mask per (time, batch, hidden) element; an empty
    # tensor signals "no zoneout" to the downstream implementations.
    if self.zoneout:
      zoneout_mask = input.new_empty(input.shape[0], input.shape[1], self.hidden_size)
      zoneout_mask.bernoulli_(1.0 - self.zoneout)
    else:
      zoneout_mask = input.new_empty(0, 0, 0)
    return zoneout_mask

  def _is_cuda(self):
    # All parameters must live on the same device class; mixing is an error.
    is_cuda = [tensor.is_cuda for tensor in list(self.parameters())]
    if any(is_cuda) and not all(is_cuda):
      raise ValueError('RNN tensors should all be CUDA tensors or none should be CUDA tensors')
    return any(is_cuda)


def _validate_state(state, state_shape):
  """
  Checks to make sure that `state` has the same nested structure and dimensions
  as `state_shape`. `None` values in `state_shape` are a wildcard and are not
  checked.

  Arguments:
    state: a nested structure of Tensors.
    state_shape: a nested structure of integers or None.

  Raises:
    ValueError: if the structure and/or shapes don't match.
  """
  if isinstance(state, (tuple, list)):
    # Handle nested structure.
    if not isinstance(state_shape, (tuple, list)):
      raise ValueError('RNN state has invalid structure; expected {}'.format(state_shape))
    for s, ss in zip(state, state_shape):
      _validate_state(s, ss)
  else:
    shape = list(state.size())
    if len(shape) != len(state_shape):
      raise ValueError('RNN state dimension mismatch; expected {} got {}'.format(len(state_shape), len(shape)))
    for i, (d1, d2) in enumerate(zip(list(state.size()), state_shape)):
      if d2 is not None and d1 != d2:
        raise ValueError('RNN state size mismatch on dim {}; expected {} got {}'.format(i, d2, d1))


def _zero_state(input, state_shape):
  """
  Returns a nested structure of zero Tensors with the same structure and shape
  as `state_shape`. The returned Tensors will have the same dtype and be on the
  same device as `input`.

  Arguments:
    input: Tensor, to specify the device and dtype of the returned tensors.
    state_shape: nested structure of integers.

  Returns:
    zero_state: a nested structure of zero Tensors.

  Raises:
    ValueError: if `state_shape` has non-integer values.
  """
  # A tuple/list whose first element is an int is treated as a leaf shape;
  # otherwise recurse into the nested structure.
  if isinstance(state_shape, (tuple, list)) and isinstance(state_shape[0], int):
    state = input.new_zeros(*state_shape)
  elif isinstance(state_shape, tuple):
    state = tuple(_zero_state(input, s) for s in state_shape)
  elif isinstance(state_shape, list):
    state = [_zero_state(input, s) for s in state_shape]
  else:
    raise ValueError('RNN state_shape is invalid')
  return state
================================================ FILE: frameworks/pytorch/gru.cc ================================================
// Copyright 2020 LMNT, Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ==============================================================================

// NOTE(review): the angle-bracket header names were lost during extraction; the
// bare `#include` tokens are kept as found. Template arguments on std::vector,
// ForwardPass/BackwardPass (originally something like
// ForwardPass<typename native_type<scalar_t>::T>) and ptr<...>() were stripped
// as well — the remaining tokens are preserved verbatim.
#include
#include
#include
#include
#include "haste.h"
#include "support.h"

namespace {

using haste::v0::gru::ForwardPass;
using haste::v0::gru::BackwardPass;
using torch::Tensor;

// Fused GRU forward op: runs the CUDA kernel over all time steps and returns
// { output, cache } where output holds time_steps + 1 states (output[0] == h0)
// and cache holds per-step activations for the backward pass.
std::vector gru_forward(
    bool training,
    float zoneout_prob,
    Tensor x,                  // (time_steps, batch_size, input_size)
    Tensor h0,                 // initial hidden state
    Tensor kernel,
    Tensor recurrent_kernel,
    Tensor bias,
    Tensor recurrent_bias,
    Tensor zoneout_mask) {
  const auto time_steps = x.size(0);
  const auto batch_size = x.size(1);
  const auto input_size = x.size(2);
  const auto hidden_size = recurrent_kernel.size(0);
  // Zoneout is active only when both a probability and a non-empty mask are given.
  const bool has_zoneout = zoneout_prob && zoneout_mask.size(0);

  CHECK_INPUT(x);
  CHECK_INPUT(h0);
  CHECK_INPUT(kernel);
  CHECK_INPUT(recurrent_kernel);
  CHECK_INPUT(bias);
  CHECK_INPUT(recurrent_bias);
  CHECK_INPUT(zoneout_mask);

  const auto options = x.options();
  const at::cuda::CUDAGuard guard(options.device_index());
  Tensor output = torch::empty({ time_steps + 1, batch_size, hidden_size }, options);
  Tensor cache = torch::empty({ time_steps, batch_size, hidden_size * 4 }, options);
  Tensor tmp_Wx = torch::empty({ time_steps, batch_size, hidden_size * 3 }, options);
  Tensor tmp_Rh = torch::empty({ batch_size, hidden_size * 3 }, options);

  output[0] = h0;

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(x.scalar_type(), "gru_forward", ([&] {
    ForwardPass::T> forward(
        training,
        batch_size,
        input_size,
        hidden_size,
        at::cuda::getCurrentCUDABlasHandle(),
        at::cuda::getCurrentCUDAStream());

    forward.Run(
        time_steps,
        ptr(kernel),
        ptr(recurrent_kernel),
        ptr(bias),
        ptr(recurrent_bias),
        ptr(x),
        ptr(output),
        ptr(cache),
        ptr(tmp_Wx),
        ptr(tmp_Rh),
        has_zoneout ? zoneout_prob : 0.0f,
        has_zoneout ? ptr(zoneout_mask) : nullptr);
  }));

  return { output, cache };
}

// Fused GRU backward op. Note the transposed input layouts: x_t is
// (input_size, time_steps, batch_size) and the kernels are pre-transposed by
// the Python caller. Returns { dx, dh, dW, dR, dbx, dbr }.
std::vector gru_backward(
    Tensor x_t,
    Tensor kernel_t,
    Tensor recurrent_kernel_t,
    Tensor bias,
    Tensor recurrent_bias,
    Tensor zoneout_mask,
    Tensor h,
    Tensor cache,
    Tensor dh_new) {
  const auto input_size = x_t.size(0);
  const auto time_steps = x_t.size(1);
  const auto batch_size = x_t.size(2);
  const auto hidden_size = recurrent_kernel_t.size(1);
  const bool has_zoneout = !!zoneout_mask.size(0);

  CHECK_INPUT(x_t);
  CHECK_INPUT(kernel_t);
  CHECK_INPUT(recurrent_kernel_t);
  CHECK_INPUT(bias);
  CHECK_INPUT(recurrent_bias);
  CHECK_INPUT(h);
  CHECK_INPUT(cache);
  CHECK_INPUT(dh_new);
  CHECK_INPUT(zoneout_mask);

  const auto options = x_t.options();
  const at::cuda::CUDAGuard guard(options.device_index());
  // Accumulated gradients start at zero; per-step scratch (dp/dq) is uninitialized.
  Tensor dx = torch::empty({ time_steps, batch_size, input_size }, options);
  Tensor dW = torch::zeros({ input_size, hidden_size * 3 }, options);
  Tensor dR = torch::zeros({ hidden_size, hidden_size * 3 }, options);
  Tensor dbx = torch::zeros({ hidden_size * 3 }, options);
  Tensor dbr = torch::zeros({ hidden_size * 3 }, options);
  Tensor dh = torch::zeros({ batch_size, hidden_size }, options);
  Tensor dp = torch::empty({ time_steps, batch_size, hidden_size * 3 }, options);
  Tensor dq = torch::empty({ time_steps, batch_size, hidden_size * 3 }, options);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(x_t.scalar_type(), "gru_backward", ([&] {
    BackwardPass::T> backward(
        batch_size,
        input_size,
        hidden_size,
        at::cuda::getCurrentCUDABlasHandle(),
        at::cuda::getCurrentCUDAStream());

    backward.Run(
        time_steps,
        ptr(kernel_t),
        ptr(recurrent_kernel_t),
        ptr(bias),
        ptr(recurrent_bias),
        ptr(x_t),
        ptr(h),
        ptr(cache),
        ptr(dh_new),
        ptr(dx),
        ptr(dW),
        ptr(dR),
        ptr(dbx),
        ptr(dbr),
        ptr(dh),
        ptr(dp),
        ptr(dq),
        has_zoneout ?
ptr(zoneout_mask) : nullptr);
  }));

  return { dx, dh, dW, dR, dbx, dbr };
}

}  // anonymous namespace

// Registers the GRU forward/backward ops on the pybind11 module.
// NOTE(review): the py::call_guard<...> template argument was stripped by
// extraction (presumably py::gil_scoped_release); tokens kept as found.
void gru_init(py::module& m) {
  m.def("gru_forward", &gru_forward, "GRU forward", py::call_guard());
  m.def("gru_backward", &gru_backward, "GRU backward", py::call_guard());
}
================================================ FILE: frameworks/pytorch/gru.py ================================================
# Copyright 2020 LMNT, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Gated Recurrent Unit"""

import haste_pytorch_lib as LIB
import torch
import torch.nn as nn
import torch.nn.functional as F

from .base_rnn import BaseRNN


__all__ = [
    'GRU'
]


#@torch.jit.script
def GRUScript(
    training: bool,
    zoneout_prob: float,
    input,
    h0,
    kernel,
    recurrent_kernel,
    bias,
    recurrent_bias,
    zoneout_mask):
  """Reference (CPU) GRU implementation in plain PyTorch ops.

  Mirrors the fused CUDA kernel: 1406.1078v1 variant with the reset gate
  applied after the recurrent matrix multiplication, `z,r,h` gate layout,
  and optional zoneout. Returns the stacked states including h0 at index 0.
  """
  time_steps = input.shape[0]
  batch_size = input.shape[1]
  hidden_size = recurrent_kernel.shape[0]

  h = [h0]
  # Input projection for all time steps at once.
  Wx = input @ kernel + bias
  for t in range(time_steps):
    Rh = h[t] @ recurrent_kernel + recurrent_bias
    vx = torch.chunk(Wx[t], 3, 1)
    vh = torch.chunk(Rh, 3, 1)
    z = torch.sigmoid(vx[0] + vh[0])   # update gate
    r = torch.sigmoid(vx[1] + vh[1])   # reset gate
    g = torch.tanh(vx[2] + r * vh[2])  # candidate state (reset applied post-matmul)
    h.append(z * h[t] + (1 - z) * g)
    if zoneout_prob:
      if training:
        # Keep-mask interpolation between new and previous state.
        h[-1] = (h[-1] - h[-2]) * zoneout_mask[t] + h[-2]
      else:
        # At inference, use the expected value of the zoneout mask.
        h[-1] = zoneout_prob * h[-2] + (1 - zoneout_prob) * h[-1]
  h = torch.stack(h)
  return h


class GRUFunction(torch.autograd.Function):
  """Autograd bridge to the fused CUDA GRU kernels in haste_pytorch_lib."""

  @staticmethod
  def forward(ctx, training, zoneout_prob, *inputs):
    h, cache = LIB.gru_forward(training, zoneout_prob, *inputs)
    # Save x, the weights/biases, the zoneout mask, and the kernel's
    # activation cache; inputs[1] (h0) is not needed for the backward pass.
    ctx.save_for_backward(inputs[0], *inputs[2:], h, cache)
    ctx.mark_non_differentiable(inputs[-1])  # zoneout mask
    ctx.training = training
    return h

  @staticmethod
  def backward(ctx, grad_h):
    if not ctx.training:
      raise RuntimeError('GRU backward can only be called in training mode')

    saved = [*ctx.saved_tensors]
    # Transpose to the layouts the CUDA backward kernel reads:
    # x -> (input_size, time_steps, batch_size), kernels -> transposed.
    saved[0] = saved[0].permute(2, 0, 1).contiguous()
    saved[1] = saved[1].permute(1, 0).contiguous()
    saved[2] = saved[2].permute(1, 0).contiguous()
    grads = LIB.gru_backward(*saved, grad_h.contiguous())
    # No gradients for `training`, `zoneout_prob`, or the zoneout mask.
    return (None, None, *grads, None)


class GRU(BaseRNN):
  """
  Gated Recurrent Unit layer.

  This GRU layer offers a fused, GPU-accelerated PyTorch op for inference
  and training. There are two commonly-used variants of GRU cells. This one
  implements 1406.1078v1 which applies the reset gate to the hidden state
  after matrix multiplication. cuDNN also implements this variant. The other
  variant, 1406.1078v3, applies the reset gate before matrix multiplication
  and is currently unsupported.

  This layer has built-in support for DropConnect and Zoneout, which are
  both techniques used to regularize RNNs.

  See [\_\_init\_\_](#__init__) and [forward](#forward) for usage.
  See [from_native_weights](#from_native_weights) and
  [to_native_weights](#to_native_weights) for compatibility with PyTorch GRUs.
  """

  def __init__(self, input_size, hidden_size, batch_first=False,
      dropout=0.0, zoneout=0.0, return_state_sequence=False):
    """
    Initialize the parameters of the GRU layer.

    Arguments:
      input_size: int, the feature dimension of the input.
      hidden_size: int, the feature dimension of the output.
      batch_first: (optional) bool, if `True`, then the input and output
        tensors are provided as `(batch, seq, feature)`.
      dropout: (optional) float, sets the dropout rate for DropConnect
        regularization on the recurrent matrix.
      zoneout: (optional) float, sets the zoneout rate for Zoneout
        regularization.
      return_state_sequence: (optional) bool, if `True`, the forward pass will
        return the entire state sequence instead of just the final state. Note
        that if the input is a padded sequence, the returned state will also
        be a padded sequence.

    Variables:
      kernel: the input projection weight matrix. Dimensions
        (input_size, hidden_size * 3) with `z,r,h` gate layout. Initialized
        with Xavier uniform initialization.
      recurrent_kernel: the recurrent projection weight matrix. Dimensions
        (hidden_size, hidden_size * 3) with `z,r,h` gate layout. Initialized
        with orthogonal initialization.
      bias: the input projection bias vector. Dimensions (hidden_size * 3)
        with `z,r,h` gate layout. Initialized to zeros.
      recurrent_bias: the recurrent projection bias vector. Dimensions
        (hidden_size * 3) with `z,r,h` gate layout. Initialized to zeros.
    """
    super().__init__(input_size, hidden_size, batch_first, zoneout, return_state_sequence)

    if dropout < 0 or dropout > 1:
      raise ValueError('GRU: dropout must be in [0.0, 1.0]')
    if zoneout < 0 or zoneout > 1:
      raise ValueError('GRU: zoneout must be in [0.0, 1.0]')

    self.dropout = dropout

    self.kernel = nn.Parameter(torch.empty(input_size, hidden_size * 3))
    self.recurrent_kernel = nn.Parameter(torch.empty(hidden_size, hidden_size * 3))
    self.bias = nn.Parameter(torch.empty(hidden_size * 3))
    self.recurrent_bias = nn.Parameter(torch.empty(hidden_size * 3))
    self.reset_parameters()

  def to_native_weights(self):
    """
    Converts Haste GRU weights to native PyTorch GRU weights.

    Returns:
      weight_ih_l0: Parameter, the input-hidden weights of the GRU layer.
      weight_hh_l0: Parameter, the hidden-hidden weights of the GRU layer.
      bias_ih_l0: Parameter, the input-hidden bias of the GRU layer.
      bias_hh_l0: Parameter, the hidden-hidden bias of the GRU layer.
    """
    def reorder_weights(w):
      # Haste uses z,r,n gate order; PyTorch uses r,z,n.
      z, r, n = torch.chunk(w, 3, dim=-1)
      return torch.cat([r, z, n], dim=-1)

    kernel = reorder_weights(self.kernel).permute(1, 0).contiguous()
    recurrent_kernel = reorder_weights(self.recurrent_kernel).permute(1, 0).contiguous()
    bias1 = reorder_weights(self.bias).contiguous()
    bias2 = reorder_weights(self.recurrent_bias).contiguous()

    kernel = torch.nn.Parameter(kernel)
    recurrent_kernel = torch.nn.Parameter(recurrent_kernel)
    bias1 = torch.nn.Parameter(bias1)
    bias2 = torch.nn.Parameter(bias2)
    return kernel, recurrent_kernel, bias1, bias2

  def from_native_weights(self, weight_ih_l0, weight_hh_l0, bias_ih_l0, bias_hh_l0):
    """
    Copies and converts the provided PyTorch GRU weights into this layer.

    Arguments:
      weight_ih_l0: Parameter, the input-hidden weights of the PyTorch GRU layer.
      weight_hh_l0: Parameter, the hidden-hidden weights of the PyTorch GRU layer.
      bias_ih_l0: Parameter, the input-hidden bias of the PyTorch GRU layer.
      bias_hh_l0: Parameter, the hidden-hidden bias of the PyTorch GRU layer.
    """
    def reorder_weights(w):
      # PyTorch uses r,z,n gate order; Haste uses z,r,n.
      # NOTE(review): `axis=` is an alias of `dim=` in torch.chunk; the sibling
      # helper in to_native_weights uses `dim=` — harmless inconsistency.
      r, z, n = torch.chunk(w, 3, axis=-1)
      return torch.cat([z, r, n], dim=-1)

    kernel = reorder_weights(weight_ih_l0.permute(1, 0)).contiguous()
    recurrent_kernel = reorder_weights(weight_hh_l0.permute(1, 0)).contiguous()
    bias = reorder_weights(bias_ih_l0).contiguous()
    recurrent_bias = reorder_weights(bias_hh_l0).contiguous()

    self.kernel = nn.Parameter(kernel)
    self.recurrent_kernel = nn.Parameter(recurrent_kernel)
    self.bias = nn.Parameter(bias)
    self.recurrent_bias = nn.Parameter(recurrent_bias)

  def reset_parameters(self):
    """Resets this layer's parameters to their initial values."""
    hidden_size = self.hidden_size
    # Initialize each gate's slice independently.
    for i in range(3):
      nn.init.xavier_uniform_(self.kernel[:, i*hidden_size:(i+1)*hidden_size])
      nn.init.orthogonal_(self.recurrent_kernel[:, i*hidden_size:(i+1)*hidden_size])
    nn.init.zeros_(self.bias)
    nn.init.zeros_(self.recurrent_bias)

  def forward(self, input, state=None, lengths=None):
    """
    Runs a forward pass of the GRU layer.

    Arguments:
      input: Tensor, a batch of input sequences to pass through the GRU.
        Dimensions (seq_len, batch_size, input_size) if `batch_first` is
        `False`, otherwise (batch_size, seq_len, input_size).
      lengths: (optional) Tensor, list of sequence lengths for each batch
        element. Dimension (batch_size). This argument may be omitted if
        all batch elements are unpadded and have the same sequence length.

    Returns:
      output: Tensor, the output of the GRU layer. Dimensions
        (seq_len, batch_size, hidden_size) if `batch_first` is `False`
        (default) or (batch_size, seq_len, hidden_size) if `batch_first`
        is `True`. Note that if `lengths` was specified, the `output`
        tensor will not be masked. It's the caller's responsibility to
        either not use the invalid entries or to mask them out before
        using them.
      h_n: the hidden state for the last sequence item. Dimensions
        (1, batch_size, hidden_size).
    """
    input = self._permute(input)
    state_shape = [1, input.shape[1], self.hidden_size]
    h0 = self._get_state(input, state, state_shape)
    # `h` contains time_steps + 1 states; h[0] is the initial state.
    h = self._impl(input, h0[0], self._get_zoneout_mask(input))
    state = self._get_final_state(h, lengths)
    output = self._permute(h[1:])
    return output, state

  def _impl(self, input, state, zoneout_mask):
    # Dispatch to the fused CUDA op when parameters live on the GPU; fall back
    # to the pure-PyTorch reference implementation on CPU. DropConnect is
    # applied to the recurrent kernel via F.dropout in both paths.
    if self._is_cuda():
      return GRUFunction.apply(
          self.training,
          self.zoneout,
          input.contiguous(),
          state.contiguous(),
          self.kernel.contiguous(),
          F.dropout(self.recurrent_kernel, self.dropout, self.training).contiguous(),
          self.bias.contiguous(),
          self.recurrent_bias.contiguous(),
          zoneout_mask.contiguous())
    else:
      return GRUScript(
          self.training,
          self.zoneout,
          input.contiguous(),
          state.contiguous(),
          self.kernel.contiguous(),
          F.dropout(self.recurrent_kernel, self.dropout, self.training).contiguous(),
          self.bias.contiguous(),
          self.recurrent_bias.contiguous(),
          zoneout_mask.contiguous())
================================================ FILE: frameworks/pytorch/indrnn.cc ================================================
// Copyright 2020 LMNT, Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ==============================================================================

// NOTE(review): the text extraction that produced this dump stripped all
// angle-bracket content, so the system header names below and every C++
// template argument in this file (e.g. ForwardPass<T>, data_ptr<T>(),
// py::call_guard<...>) are missing. Tokens are preserved verbatim;
// restore the template arguments from the upstream sources.
#include
#include
#include
#include
#include "haste.h"
#include "support.h"

namespace {

using haste::v0::indrnn::ForwardPass;
using haste::v0::indrnn::BackwardPass;
using torch::Tensor;

// Runs the fused IndRNN forward pass on the GPU.
// Returns `output` of shape [time_steps + 1, batch_size, hidden_size],
// where output[0] holds the initial state h0.
Tensor indrnn_forward(
    bool training,
    float zoneout_prob,
    Tensor x,
    Tensor h0,
    Tensor kernel,
    Tensor recurrent_scale,
    Tensor bias,
    Tensor zoneout_mask) {
  const auto time_steps = x.size(0);
  const auto batch_size = x.size(1);
  const auto input_size = x.size(2);
  const auto hidden_size = recurrent_scale.size(0);
  // Zoneout is active only if both a nonzero rate and a mask were supplied.
  const bool has_zoneout = zoneout_prob && zoneout_mask.size(0);

  CHECK_INPUT(x);
  CHECK_INPUT(h0);
  CHECK_INPUT(kernel);
  CHECK_INPUT(recurrent_scale);
  CHECK_INPUT(bias);
  CHECK_INPUT(zoneout_mask);

  const auto options = x.options();
  // Ensure kernels launch on the device that owns the input tensors.
  const at::cuda::CUDAGuard guard(options.device_index());
  Tensor output = torch::empty({ time_steps + 1, batch_size, hidden_size }, options);
  Tensor workspace = torch::empty({ time_steps, batch_size, hidden_size }, options);

  output[0] = h0;

  AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "indrnn_forward", ([&] {
    ForwardPass forward(
        training,
        batch_size,
        input_size,
        hidden_size,
        at::cuda::getCurrentCUDABlasHandle(),
        at::cuda::getCurrentCUDAStream());

    forward.Run(
        time_steps,
        kernel.data_ptr(),
        recurrent_scale.data_ptr(),
        bias.data_ptr(),
        x.data_ptr(),
        output.data_ptr(),
        workspace.data_ptr(),
        has_zoneout ? zoneout_prob : 0.0f,
        has_zoneout ? zoneout_mask.data_ptr() : nullptr);
  }));

  return output;
}

// Runs the IndRNN backward pass. The caller pre-transposes x_t to
// [input_size, time_steps, batch_size] and kernel_t to
// [hidden_size, input_size]. Returns { dx, dh, dW, du, db }.
std::vector indrnn_backward(
    Tensor x_t,
    Tensor kernel_t,
    Tensor recurrent_scale,
    Tensor bias,
    Tensor zoneout_mask,
    Tensor h,
    Tensor dh_new) {
  const auto input_size = x_t.size(0);
  const auto time_steps = x_t.size(1);
  const auto batch_size = x_t.size(2);
  const auto hidden_size = recurrent_scale.size(0);
  const bool has_zoneout = !!zoneout_mask.size(0);

  CHECK_INPUT(x_t);
  CHECK_INPUT(kernel_t);
  CHECK_INPUT(recurrent_scale);
  CHECK_INPUT(bias);
  CHECK_INPUT(zoneout_mask);
  CHECK_INPUT(h);
  CHECK_INPUT(dh_new);

  const auto options = x_t.options();
  const at::cuda::CUDAGuard guard(options.device_index());
  Tensor dx = torch::empty({ time_steps, batch_size, input_size }, options);
  // Weight gradients accumulate over time steps, so they start zeroed.
  Tensor dW = torch::zeros({ input_size, hidden_size }, options);
  Tensor du = torch::zeros({ hidden_size }, options);
  Tensor db = torch::zeros_like(bias);
  Tensor dh = torch::zeros({ batch_size, hidden_size }, options);
  Tensor workspace = torch::empty({ time_steps, batch_size, hidden_size }, options);

  AT_DISPATCH_FLOATING_TYPES(x_t.scalar_type(), "indrnn_backward", ([&] {
    BackwardPass backward(
        batch_size,
        input_size,
        hidden_size,
        at::cuda::getCurrentCUDABlasHandle(),
        at::cuda::getCurrentCUDAStream());

    backward.Run(
        time_steps,
        kernel_t.data_ptr(),
        recurrent_scale.data_ptr(),
        bias.data_ptr(),
        x_t.data_ptr(),
        h.data_ptr(),
        dh_new.data_ptr(),
        dx.data_ptr(),
        dW.data_ptr(),
        du.data_ptr(),
        db.data_ptr(),
        dh.data_ptr(),
        workspace.data_ptr(),
        has_zoneout ? zoneout_mask.data_ptr() : nullptr);
  }));

  return { dx, dh, dW, du, db };
}

} // anonymous namespace

// Registers the IndRNN ops on the extension module. The GIL is released
// while the ops execute (py::call_guard's template argument was stripped
// by extraction; upstream it is py::gil_scoped_release).
void indrnn_init(py::module& m) {
  m.def("indrnn_forward", &indrnn_forward, "IndRNN forward", py::call_guard());
  m.def("indrnn_backward", &indrnn_backward, "IndRNN backward", py::call_guard());
}

================================================
FILE: frameworks/pytorch/indrnn.py
================================================
# Copyright 2020 LMNT, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Independently Recurrent Neural Network"""


import haste_pytorch_lib as LIB
import torch
import torch.nn as nn
import torch.nn.functional as F

from .base_rnn import BaseRNN


__all__ = [
    'IndRNN'
]


#@torch.jit.script
def IndRNNScript(
    training: bool,
    zoneout_prob: float,
    input,
    h0,
    kernel,
    recurrent_scale,
    bias,
    zoneout_mask):
  """Reference (non-fused) IndRNN recurrence: h_t = tanh(W x_t + b + u * h_{t-1})."""
  time_steps = input.shape[0]

  h = [h0]
  # The input projections for every time step are computed in one matmul.
  Wx = input @ kernel + bias
  for t in range(time_steps):
    h.append(torch.tanh(Wx[t] + h[-1] * recurrent_scale))
    if zoneout_prob:
      if training:
        # Zoneout (training): keep a random subset of units from h_{t-1}.
        h[-1] = (h[-1] - h[-2]) * zoneout_mask[t] + h[-2]
      else:
        # Zoneout (inference): blend states by the expected retention rate.
        h[-1] = zoneout_prob * h[-2] + (1 - zoneout_prob) * h[-1]
  h = torch.stack(h)
  return h


class IndRNNFunction(torch.autograd.Function):
  """Autograd bridge to the fused CUDA IndRNN kernels."""

  @staticmethod
  def forward(ctx, training, zoneout_prob, *inputs):
    h = LIB.indrnn_forward(training, zoneout_prob, *inputs)
    # Save x, kernel, recurrent_scale, bias, zoneout_mask plus the full state
    # sequence; h0 (inputs[1]) is not needed since h contains it.
    ctx.save_for_backward(inputs[0], *inputs[2:], h)
    ctx.training = training
    return h

  @staticmethod
  def backward(ctx, grad_h):
    if not ctx.training:
      raise RuntimeError('IndRNN backward can only be called in training mode')

    saved = [*ctx.saved_tensors]
    # The CUDA backward kernel expects x and the kernel pre-transposed.
    saved[0] = saved[0].permute(2, 0, 1).contiguous()
    saved[1] = saved[1].permute(1, 0).contiguous()
    grads = LIB.indrnn_backward(*saved, grad_h.contiguous())
    # No gradients for `training`, `zoneout_prob`, or the zoneout mask.
    return (None, None, *grads, None)


class IndRNN(BaseRNN):
  """
  Independently Recurrent Neural Network layer.

  This layer offers a fused, GPU-accelerated PyTorch op for inference
  and training. It also supports Zoneout regularization.

  See [\_\_init\_\_](#__init__) and [forward](#forward) for usage.
  """

  def __init__(
      self,
      input_size,
      hidden_size,
      batch_first=False,
      zoneout=0.0,
      return_state_sequence=False):
    """
    Initialize the parameters of the IndRNN layer.

    Arguments:
      input_size: int, the feature dimension of the input.
      hidden_size: int, the feature dimension of the output.
      batch_first: (optional) bool, if `True`, then the input and output
        tensors are provided as `(batch, seq, feature)`.
      zoneout: (optional) float, sets the zoneout rate for Zoneout
        regularization.
      return_state_sequence: (optional) bool, if `True`, the forward pass will
        return the entire state sequence instead of just the final state. Note
        that if the input is a padded sequence, the returned state will also
        be a padded sequence.

    Variables:
      kernel: the input projection weight matrix. Dimensions
        (input_size, hidden_size). Initialized with Xavier uniform
        initialization.
      recurrent_scale: the recurrent scale weight vector. Dimensions
        (hidden_size). Initialized uniformly in [-0.5, 0.5]. Note that this
        initialization scheme is different than in the original authors'
        implementation. See https://github.com/lmnt-com/haste/issues/7
        for details.
      bias: the RNN bias vector. Dimensions (hidden_size). Initialized to
        zeros.
    """
    super().__init__(input_size, hidden_size, batch_first, zoneout, return_state_sequence)

    if zoneout < 0 or zoneout > 1:
      raise ValueError('IndRNN: zoneout must be in [0.0, 1.0]')

    self.input_size = input_size
    self.hidden_size = hidden_size
    self.batch_first = batch_first
    self.zoneout = zoneout

    self.kernel = nn.Parameter(torch.empty(input_size, hidden_size))
    self.recurrent_scale = nn.Parameter(torch.empty(hidden_size))
    self.bias = nn.Parameter(torch.empty(hidden_size))
    self.reset_parameters()

  def reset_parameters(self):
    """Resets this layer's parameters to their initial values."""
    nn.init.xavier_uniform_(self.kernel)
    nn.init.uniform_(self.recurrent_scale, -0.5, 0.5)
    nn.init.zeros_(self.bias)

  def forward(self, input, state=None, lengths=None):
    """
    Runs a forward pass of the IndRNN layer.

    Arguments:
      input: Tensor, a batch of input sequences to pass through the IndRNN.
        Dimensions (seq_len, batch_size, input_size) if `batch_first` is
        `False`, otherwise (batch_size, seq_len, input_size).
      state: (optional) Tensor, the initial state for each batch element in
        `input`. Dimensions (1, batch_size, hidden_size). Defaults to zeros.
      lengths: (optional) Tensor, list of sequence lengths for each batch
        element. Dimension (batch_size). This argument may be omitted if
        all batch elements are unpadded and have the same sequence length.

    Returns:
      output: Tensor, the output of the IndRNN layer. Dimensions (seq_len,
        batch_size, hidden_size) if `batch_first` is `False` (default) or
        (batch_size, seq_len, hidden_size) if `batch_first` is `True`. Note
        that if `lengths` was specified, the `output` tensor will not be
        masked. It's the caller's responsibility to either not use the
        invalid entries or to mask them out before using them.
      state: the hidden state for the last sequence item. Dimensions
        (1, batch_size, hidden_size).
    """
    input = self._permute(input)
    state_shape = [1, input.shape[1], self.hidden_size]
    h0 = self._get_state(input, state, state_shape)
    h = self._impl(input, h0[0], self._get_zoneout_mask(input))
    state = self._get_final_state(h, lengths)
    # h[0] is the initial state; drop it from the returned sequence.
    output = self._permute(h[1:])
    return output, state

  def _impl(self, input, state, zoneout_mask):
    # Use the fused CUDA op on GPU; otherwise the pure-PyTorch reference.
    if self._is_cuda():
      return IndRNNFunction.apply(
          self.training,
          self.zoneout,
          input.contiguous(),
          state.contiguous(),
          self.kernel.contiguous(),
          self.recurrent_scale.contiguous(),
          self.bias.contiguous(),
          zoneout_mask.contiguous())
    else:
      return IndRNNScript(
          self.training,
          self.zoneout,
          input.contiguous(),
          state.contiguous(),
          self.kernel.contiguous(),
          self.recurrent_scale.contiguous(),
          self.bias.contiguous(),
          zoneout_mask.contiguous())


================================================
FILE: frameworks/pytorch/layer_norm_gru.cc
================================================
// Copyright 2020 LMNT, Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ==============================================================================

// NOTE(review): angle-bracket header names and all template arguments in
// this file were stripped by the text extraction (e.g. ForwardPass<T>,
// packed_accessor32<...>, data_ptr<T>(), py::call_guard<...>). Tokens are
// otherwise preserved verbatim; restore them from the upstream sources.
#include
#include
#include
#include
#include "haste.h"
#include "support.h"

namespace {

namespace layer_norm = haste::v0::layer_norm;
namespace layer_norm_gru = haste::v0::layer_norm_gru;
using torch::Tensor;

// Fused LayerNorm-GRU forward pass. Returns the state sequence plus the
// intermediate activations/caches that the backward pass needs.
std::vector layer_norm_gru_forward(
    bool training,
    float zoneout_prob,
    Tensor x,
    Tensor h0,
    Tensor kernel,
    Tensor recurrent_kernel,
    Tensor bias,
    Tensor recurrent_bias,
    Tensor gamma,
    Tensor zoneout_mask) {
  const auto time_steps = x.size(0);
  const auto batch_size = x.size(1);
  const auto input_size = x.size(2);
  const auto hidden_size = recurrent_kernel.size(0);
  // Zoneout is active only if both a nonzero rate and a mask were supplied.
  const bool has_zoneout = zoneout_prob && zoneout_mask.size(0);

  CHECK_INPUT(x);
  CHECK_INPUT(h0);
  CHECK_INPUT(kernel);
  CHECK_INPUT(recurrent_kernel);
  CHECK_INPUT(bias);
  CHECK_INPUT(recurrent_bias);
  CHECK_INPUT(gamma);
  CHECK_INPUT(zoneout_mask);

  const auto options = x.options();
  const at::cuda::CUDAGuard guard(options.device_index());
  Tensor output = torch::empty({ time_steps + 1, batch_size, hidden_size }, options);
  Tensor cache = torch::empty({ time_steps, batch_size, hidden_size * 4 }, options);
  Tensor act_Wx = torch::empty({ time_steps, batch_size, hidden_size * 3 }, options);
  Tensor tmp_Wx_norm = torch::empty({ time_steps, batch_size, hidden_size * 3 }, options);
  // Each layer-norm cache row stores 2 statistics per normalized vector.
  Tensor act_Wx_norm_cache = torch::empty({ time_steps, batch_size, 2 }, options);
  Tensor act_Rh = torch::empty({ time_steps, batch_size, hidden_size * 3 }, options);
  Tensor tmp_Rh_norm = torch::empty({ batch_size, hidden_size * 3 }, options);
  Tensor act_Rh_norm_cache = torch::empty({ time_steps, batch_size, 2 }, options);

  output[0] = h0;

  AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "layer_norm_gru_forward", ([&] {
    auto gamma_a = gamma.packed_accessor32();
    // gamma[0] scales the input-projection norm, gamma[1] the recurrent one.
    layer_norm::ForwardPass layer_norm1(
        time_steps * batch_size,
        hidden_size * 3,
        gamma_a[0].data(),
        nullptr,
        act_Wx_norm_cache.data_ptr());
    layer_norm::ForwardPass layer_norm2(
        time_steps * batch_size,
        hidden_size * 3,
        gamma_a[1].data(),
        nullptr,
        act_Rh_norm_cache.data_ptr());

    layer_norm_gru::ForwardPass gru(
        training,
        batch_size,
        input_size,
        hidden_size,
        at::cuda::getCurrentCUDABlasHandle(),
        at::cuda::getCurrentCUDAStream());

    gru.Run(
        time_steps,
        kernel.data_ptr(),
        recurrent_kernel.data_ptr(),
        bias.data_ptr(),
        recurrent_bias.data_ptr(),
        x.data_ptr(),
        output.data_ptr(),
        cache.data_ptr(),
        act_Wx.data_ptr(),
        layer_norm1,
        tmp_Wx_norm.data_ptr(),
        act_Rh.data_ptr(),
        layer_norm2,
        tmp_Rh_norm.data_ptr(),
        has_zoneout ? zoneout_prob : 0.0f,
        has_zoneout ? zoneout_mask.data_ptr() : nullptr);
  }));

  return { output, cache, act_Wx, act_Wx_norm_cache, act_Rh, act_Rh_norm_cache };
}

// Fused LayerNorm-GRU backward pass. The caller pre-transposes x_t,
// kernel_t, and recurrent_kernel_t. Returns
// { dx, dh, dW, dR, dbx, dbr, dgamma }.
std::vector layer_norm_gru_backward(
    Tensor x_t,
    Tensor kernel_t,
    Tensor recurrent_kernel_t,
    Tensor bias,
    Tensor recurrent_bias,
    Tensor gamma,
    Tensor zoneout_mask,
    Tensor h,
    Tensor cache,
    Tensor act_Wx,
    Tensor act_Wx_norm_cache,
    Tensor act_Rh,
    Tensor act_Rh_norm_cache,
    Tensor dh_new) {
  const auto input_size = x_t.size(0);
  const auto time_steps = x_t.size(1);
  const auto batch_size = x_t.size(2);
  const auto hidden_size = recurrent_kernel_t.size(1);
  const bool has_zoneout = !!zoneout_mask.size(0);

  CHECK_INPUT(x_t);
  CHECK_INPUT(kernel_t);
  CHECK_INPUT(recurrent_kernel_t);
  CHECK_INPUT(bias);
  CHECK_INPUT(recurrent_bias);
  CHECK_INPUT(gamma);
  CHECK_INPUT(h);
  CHECK_INPUT(cache);
  CHECK_INPUT(act_Wx);
  CHECK_INPUT(act_Wx_norm_cache);
  CHECK_INPUT(act_Rh);
  CHECK_INPUT(act_Rh_norm_cache);
  CHECK_INPUT(dh_new);
  CHECK_INPUT(zoneout_mask);

  const auto options = x_t.options();
  const at::cuda::CUDAGuard guard(options.device_index());
  Tensor dx = torch::empty({ time_steps, batch_size, input_size }, options);
  // Weight/bias gradients accumulate over time steps, so they start zeroed.
  Tensor dW = torch::zeros({ input_size, hidden_size * 3 }, options);
  Tensor dR = torch::zeros({ hidden_size, hidden_size * 3 }, options);
  Tensor dbx = torch::zeros({ hidden_size * 3 }, options);
  Tensor dbr = torch::zeros({ hidden_size * 3 }, options);
  Tensor dh = torch::zeros({ batch_size, hidden_size }, options);
  Tensor dp = torch::empty({ time_steps, batch_size, hidden_size * 3 }, options);
  Tensor dq = torch::empty({ time_steps, batch_size, hidden_size * 3 }, options);
  Tensor dgamma = torch::zeros_like(gamma);

  AT_DISPATCH_FLOATING_TYPES(x_t.scalar_type(), "layer_norm_gru_backward", ([&] {
    auto gamma_a = gamma.packed_accessor32();
    auto dgamma_a = dgamma.packed_accessor32();
    layer_norm::BackwardPass layer_norm1(
        time_steps * batch_size,
        hidden_size * 3,
        gamma_a[0].data(),
        nullptr,
        act_Wx.data_ptr(),
        dgamma_a[0].data(),
        nullptr,
        act_Wx_norm_cache.data_ptr());
    layer_norm::BackwardPass layer_norm2(
        time_steps * batch_size,
        hidden_size * 3,
        gamma_a[1].data(),
        nullptr,
        act_Rh.data_ptr(),
        dgamma_a[1].data(),
        nullptr,
        act_Rh_norm_cache.data_ptr());

    layer_norm_gru::BackwardPass gru(
        batch_size,
        input_size,
        hidden_size,
        at::cuda::getCurrentCUDABlasHandle(),
        at::cuda::getCurrentCUDAStream());

    gru.Run(
        time_steps,
        kernel_t.data_ptr(),
        recurrent_kernel_t.data_ptr(),
        bias.data_ptr(),
        recurrent_bias.data_ptr(),
        x_t.data_ptr(),
        h.data_ptr(),
        cache.data_ptr(),
        dh_new.data_ptr(),
        dx.data_ptr(),
        dW.data_ptr(),
        dR.data_ptr(),
        dbx.data_ptr(),
        dbr.data_ptr(),
        dh.data_ptr(),
        dp.data_ptr(),
        dq.data_ptr(),
        layer_norm1,
        layer_norm2,
        has_zoneout ? zoneout_mask.data_ptr() : nullptr);
  }));

  return { dx, dh, dW, dR, dbx, dbr, dgamma };
}

} // anonymous namespace

// Registers the LayerNormGRU ops; the GIL is released while they run.
void layer_norm_gru_init(py::module& m) {
  m.def("layer_norm_gru_forward", &layer_norm_gru_forward, "LayerNormGRU forward", py::call_guard());
  m.def("layer_norm_gru_backward", &layer_norm_gru_backward, "LayerNormGRU backward", py::call_guard());
}

================================================
FILE: frameworks/pytorch/layer_norm_gru.py
================================================
# Copyright 2020 LMNT, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Layer Normalized Gated Recurrent Unit"""


import haste_pytorch_lib as LIB
import torch
import torch.nn as nn
import torch.nn.functional as F

from .base_rnn import BaseRNN


__all__ = [
    'LayerNormGRU'
]


#@torch.jit.script
def LayerNormGRUScript(
    training: bool,
    zoneout_prob: float,
    input,
    h0,
    kernel,
    recurrent_kernel,
    bias,
    recurrent_bias,
    gamma,
    zoneout_mask):
  """Reference (non-fused) layer-normalized GRU recurrence."""
  time_steps = input.shape[0]
  batch_size = input.shape[1]
  hidden_size = recurrent_kernel.shape[0]

  h = [h0]
  # Normalize the input projections for all time steps in one shot;
  # gamma[0] is the input gain, gamma[1] the recurrent gain.
  Wx = F.layer_norm(input @ kernel, (hidden_size * 3,), weight=gamma[0]) + bias
  for t in range(time_steps):
    Rh = F.layer_norm(h[t] @ recurrent_kernel, (hidden_size * 3,), weight=gamma[1]) + recurrent_bias
    # Gate layout is (z, r, h-candidate).
    vx = torch.chunk(Wx[t], 3, 1)
    vh = torch.chunk(Rh, 3, 1)

    z = torch.sigmoid(vx[0] + vh[0])
    r = torch.sigmoid(vx[1] + vh[1])
    # 1406.1078v1 variant: reset gate applied after the matrix multiply.
    g = torch.tanh(vx[2] + r * vh[2])

    h.append(z * h[t] + (1 - z) * g)
    if zoneout_prob:
      if training:
        # Zoneout (training): keep a random subset of units from h_{t-1}.
        h[-1] = (h[-1] - h[-2]) * zoneout_mask[t] + h[-2]
      else:
        # Zoneout (inference): blend states by the expected retention rate.
        h[-1] = zoneout_prob * h[-2] + (1 - zoneout_prob) * h[-1]
  h = torch.stack(h)
  return h


class LayerNormGRUFunction(torch.autograd.Function):
  """Autograd bridge to the fused CUDA LayerNormGRU kernels."""

  @staticmethod
  def forward(ctx, training, zoneout_prob, *inputs):
    output = LIB.layer_norm_gru_forward(training, zoneout_prob, *inputs)
    # Save the op inputs (minus h0) and every forward cache for backward.
    ctx.save_for_backward(inputs[0], *inputs[2:], *output)
    # The zoneout mask is random and never differentiated.
    ctx.mark_non_differentiable(inputs[-1])
    ctx.training = training
    return output[0]

  @staticmethod
  def backward(ctx, grad_h):
    if not ctx.training:
      raise RuntimeError('LayerNormGRU backward can only be called in training mode')

    saved = [*ctx.saved_tensors]
    # The CUDA backward kernel expects x and both kernels pre-transposed.
    saved[0] = saved[0].permute(2, 0, 1).contiguous()
    saved[1] = saved[1].permute(1, 0).contiguous()
    saved[2] = saved[2].permute(1, 0).contiguous()
    grads = LIB.layer_norm_gru_backward(*saved, grad_h.contiguous())
    # No gradients for `training`, `zoneout_prob`, or the zoneout mask.
    return (None, None, *grads, None)


class LayerNormGRU(BaseRNN):
  """
  Layer Normalized Gated Recurrent Unit layer.

  This GRU layer applies layer normalization to the input and recurrent output
  activations of a standard GRU. The implementation is fused and
  GPU-accelerated.

  There are two commonly-used variants of GRU cells. This one implements
  1406.1078v1 which applies the reset gate to the hidden state after matrix
  multiplication. The other variant, 1406.1078v3, applies the reset gate
  before matrix multiplication and is currently unsupported.

  This layer has built-in support for DropConnect and Zoneout, which are
  both techniques used to regularize RNNs.

  See [\_\_init\_\_](#__init__) and [forward](#forward) for usage.
  """

  def __init__(self,
      input_size,
      hidden_size,
      batch_first=False,
      dropout=0.0,
      zoneout=0.0,
      return_state_sequence=False):
    """
    Initialize the parameters of the GRU layer.

    Arguments:
      input_size: int, the feature dimension of the input.
      hidden_size: int, the feature dimension of the output.
      batch_first: (optional) bool, if `True`, then the input and output
        tensors are provided as `(batch, seq, feature)`.
      dropout: (optional) float, sets the dropout rate for DropConnect
        regularization on the recurrent matrix.
      zoneout: (optional) float, sets the zoneout rate for Zoneout
        regularization.
      return_state_sequence: (optional) bool, if `True`, the forward pass will
        return the entire state sequence instead of just the final state. Note
        that if the input is a padded sequence, the returned state will also
        be a padded sequence.

    Variables:
      kernel: the input projection weight matrix. Dimensions
        (input_size, hidden_size * 3) with `z,r,h` gate layout. Initialized
        with Xavier uniform initialization.
      recurrent_kernel: the recurrent projection weight matrix. Dimensions
        (hidden_size, hidden_size * 3) with `z,r,h` gate layout. Initialized
        with orthogonal initialization.
      bias: the input projection bias vector. Dimensions (hidden_size * 3)
        with `z,r,h` gate layout. Initialized to zeros.
      recurrent_bias: the recurrent projection bias vector. Dimensions
        (hidden_size * 3) with `z,r,h` gate layout. Initialized to zeros.
      gamma: the input and recurrent normalization gain. Dimensions
        (2, hidden_size * 3) with `gamma[0]` specifying the input gain and
        `gamma[1]` specifying the recurrent gain. Initialized to ones.
    """
    super().__init__(input_size, hidden_size, batch_first, zoneout, return_state_sequence)

    if dropout < 0 or dropout > 1:
      raise ValueError('LayerNormGRU: dropout must be in [0.0, 1.0]')
    if zoneout < 0 or zoneout > 1:
      raise ValueError('LayerNormGRU: zoneout must be in [0.0, 1.0]')

    self.dropout = dropout

    self.kernel = nn.Parameter(torch.empty(input_size, hidden_size * 3))
    self.recurrent_kernel = nn.Parameter(torch.empty(hidden_size, hidden_size * 3))
    self.bias = nn.Parameter(torch.empty(hidden_size * 3))
    self.recurrent_bias = nn.Parameter(torch.empty(hidden_size * 3))
    self.gamma = nn.Parameter(torch.empty(2, hidden_size * 3))
    self.reset_parameters()

  def reset_parameters(self):
    """Resets this layer's parameters to their initial values."""
    hidden_size = self.hidden_size
    for i in range(3):
      # Initialize each gate's block of the weight matrices independently.
      nn.init.xavier_uniform_(self.kernel[:, i*hidden_size:(i+1)*hidden_size])
      nn.init.orthogonal_(self.recurrent_kernel[:, i*hidden_size:(i+1)*hidden_size])
    nn.init.zeros_(self.bias)
    nn.init.zeros_(self.recurrent_bias)
    nn.init.ones_(self.gamma)

  def forward(self, input, state=None, lengths=None):
    """
    Runs a forward pass of the GRU layer.

    Arguments:
      input: Tensor, a batch of input sequences to pass through the GRU.
        Dimensions (seq_len, batch_size, input_size) if `batch_first` is
        `False`, otherwise (batch_size, seq_len, input_size).
      state: (optional) Tensor, the initial state for each batch element in
        `input`. Dimensions (1, batch_size, hidden_size). Defaults to zeros.
      lengths: (optional) Tensor, list of sequence lengths for each batch
        element. Dimension (batch_size). This argument may be omitted if
        all batch elements are unpadded and have the same sequence length.

    Returns:
      output: Tensor, the output of the GRU layer. Dimensions (seq_len,
        batch_size, hidden_size) if `batch_first` is `False` (default) or
        (batch_size, seq_len, hidden_size) if `batch_first` is `True`. Note
        that if `lengths` was specified, the `output` tensor will not be
        masked. It's the caller's responsibility to either not use the
        invalid entries or to mask them out before using them.
      h_n: the hidden state for the last sequence item. Dimensions
        (1, batch_size, hidden_size).
    """
    input = self._permute(input)
    state_shape = [1, input.shape[1], self.hidden_size]
    h0 = self._get_state(input, state, state_shape)
    h = self._impl(input, h0[0], self._get_zoneout_mask(input))
    state = self._get_final_state(h, lengths)
    # h[0] is the initial state; drop it from the returned sequence.
    output = self._permute(h[1:])
    return output, state

  def _impl(self, input, state, zoneout_mask):
    # Use the fused CUDA op on GPU; otherwise the pure-PyTorch reference.
    if self._is_cuda():
      return LayerNormGRUFunction.apply(
          self.training,
          self.zoneout,
          input.contiguous(),
          state.contiguous(),
          self.kernel.contiguous(),
          F.dropout(self.recurrent_kernel, self.dropout, self.training).contiguous(),
          self.bias.contiguous(),
          self.recurrent_bias.contiguous(),
          self.gamma.contiguous(),
          zoneout_mask.contiguous())
    else:
      return LayerNormGRUScript(
          self.training,
          self.zoneout,
          input.contiguous(),
          state.contiguous(),
          self.kernel.contiguous(),
          F.dropout(self.recurrent_kernel, self.dropout, self.training).contiguous(),
          self.bias.contiguous(),
          self.recurrent_bias.contiguous(),
          self.gamma.contiguous(),
          zoneout_mask.contiguous())


================================================
FILE: frameworks/pytorch/layer_norm_indrnn.cc
================================================
// Copyright 2020 LMNT, Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ==============================================================================

// NOTE(review): angle-bracket header names and all template arguments in
// this file were stripped by the text extraction; tokens are otherwise
// preserved verbatim. Restore them from the upstream sources.
#include
#include
#include
#include
#include "haste.h"
#include "support.h"

namespace {

namespace layer_norm = haste::v0::layer_norm;
namespace layer_norm_indrnn = haste::v0::layer_norm_indrnn;
using torch::Tensor;

// Fused LayerNorm-IndRNN forward pass; layer normalization (gamma[0]) is
// applied to the input projections before the recurrence.
std::vector layer_norm_indrnn_forward(
    bool training,
    float zoneout_prob,
    Tensor x,
    Tensor h0,
    Tensor kernel,
    Tensor recurrent_scale,
    Tensor bias,
    Tensor gamma,
    Tensor zoneout_mask) {
  const auto time_steps = x.size(0);
  const auto batch_size = x.size(1);
  const auto input_size = x.size(2);
  const auto hidden_size = recurrent_scale.size(0);
  // Zoneout is active only if both a nonzero rate and a mask were supplied.
  const bool has_zoneout = zoneout_prob && zoneout_mask.size(0);

  CHECK_INPUT(x);
  CHECK_INPUT(h0);
  CHECK_INPUT(kernel);
  CHECK_INPUT(recurrent_scale);
  CHECK_INPUT(bias);
  CHECK_INPUT(gamma);
  CHECK_INPUT(zoneout_mask);

  const auto options = x.options();
  const at::cuda::CUDAGuard guard(options.device_index());
  Tensor output = torch::empty({ time_steps + 1, batch_size, hidden_size }, options);
  Tensor workspace = torch::empty({ time_steps, batch_size, hidden_size }, options);
  Tensor act_Wx = torch::empty({ time_steps, batch_size, hidden_size }, options);
  // Each layer-norm cache row stores 2 statistics per normalized vector.
  Tensor act_Wx_norm_cache = torch::empty({ time_steps, batch_size, 2 }, options);

  output[0] = h0;

  AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "layer_norm_indrnn_forward", ([&] {
    auto gamma_a = gamma.packed_accessor32();
    layer_norm::ForwardPass layer_norm1(
        time_steps * batch_size,
        hidden_size,
        gamma_a[0].data(),
        nullptr,
        act_Wx_norm_cache.data_ptr());

    layer_norm_indrnn::ForwardPass forward(
        training,
        batch_size,
        input_size,
        hidden_size,
        at::cuda::getCurrentCUDABlasHandle(),
        at::cuda::getCurrentCUDAStream());

    forward.Run(
        time_steps,
        kernel.data_ptr(),
        recurrent_scale.data_ptr(),
        bias.data_ptr(),
        x.data_ptr(),
        output.data_ptr(),
        workspace.data_ptr(),
        act_Wx.data_ptr(),
        layer_norm1,
        has_zoneout ? zoneout_prob : 0.0f,
        has_zoneout ? zoneout_mask.data_ptr() : nullptr);
  }));

  return { output, act_Wx, act_Wx_norm_cache };
}

// Fused LayerNorm-IndRNN backward pass. The caller pre-transposes x_t and
// kernel_t. Returns { dx, dh, dW, du, db, dgamma }.
std::vector layer_norm_indrnn_backward(
    Tensor x_t,
    Tensor kernel_t,
    Tensor recurrent_scale,
    Tensor bias,
    Tensor gamma,
    Tensor zoneout_mask,
    Tensor h,
    Tensor act_Wx,
    Tensor act_Wx_norm_cache,
    Tensor dh_new) {
  const auto input_size = x_t.size(0);
  const auto time_steps = x_t.size(1);
  const auto batch_size = x_t.size(2);
  const auto hidden_size = recurrent_scale.size(0);
  const bool has_zoneout = !!zoneout_mask.size(0);

  // NOTE(review): unlike the other tensor arguments, act_Wx and
  // act_Wx_norm_cache are not CHECK_INPUT-verified here -- confirm upstream
  // whether that omission is intentional.
  CHECK_INPUT(x_t);
  CHECK_INPUT(kernel_t);
  CHECK_INPUT(recurrent_scale);
  CHECK_INPUT(bias);
  CHECK_INPUT(gamma);
  CHECK_INPUT(zoneout_mask);
  CHECK_INPUT(h);
  CHECK_INPUT(dh_new);

  const auto options = x_t.options();
  const at::cuda::CUDAGuard guard(options.device_index());
  Tensor dx = torch::empty({ time_steps, batch_size, input_size }, options);
  // Weight/bias gradients accumulate over time steps, so they start zeroed.
  Tensor dW = torch::zeros({ input_size, hidden_size }, options);
  Tensor du = torch::zeros({ hidden_size }, options);
  Tensor db = torch::zeros_like(bias);
  Tensor dh = torch::zeros({ batch_size, hidden_size }, options);
  Tensor workspace = torch::empty({ time_steps, batch_size, hidden_size }, options);
  Tensor dgamma = torch::zeros_like(gamma);

  AT_DISPATCH_FLOATING_TYPES(x_t.scalar_type(), "layer_norm_indrnn_backward", ([&] {
    auto gamma_a = gamma.packed_accessor32();
    auto dgamma_a = dgamma.packed_accessor32();
    layer_norm::BackwardPass layer_norm1(
        time_steps * batch_size,
        hidden_size,
        gamma_a[0].data(),
        nullptr,
        act_Wx.data_ptr(),
        dgamma_a[0].data(),
        nullptr,
        act_Wx_norm_cache.data_ptr());

    layer_norm_indrnn::BackwardPass backward(
        batch_size,
        input_size,
        hidden_size,
        at::cuda::getCurrentCUDABlasHandle(),
        at::cuda::getCurrentCUDAStream());

    backward.Run(
        time_steps,
        kernel_t.data_ptr(),
        recurrent_scale.data_ptr(),
        bias.data_ptr(),
        x_t.data_ptr(),
        h.data_ptr(),
        dh_new.data_ptr(),
        dx.data_ptr(),
        dW.data_ptr(),
        du.data_ptr(),
        db.data_ptr(),
        dh.data_ptr(),
        workspace.data_ptr(),
        layer_norm1,
        has_zoneout ? zoneout_mask.data_ptr() : nullptr);
  }));

  return { dx, dh, dW, du, db, dgamma };
}

} // anonymous namespace

// Registers the LayerNormIndRNN ops; the GIL is released while they run.
void layer_norm_indrnn_init(py::module& m) {
  m.def("layer_norm_indrnn_forward", &layer_norm_indrnn_forward, "LayerNormIndRNN forward", py::call_guard());
  m.def("layer_norm_indrnn_backward", &layer_norm_indrnn_backward, "LayerNormIndRNN backward", py::call_guard());
}

================================================
FILE: frameworks/pytorch/layer_norm_indrnn.py
================================================
# Copyright 2020 LMNT, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Layer Normalized Independently Recurrent Neural Network"""


import haste_pytorch_lib as LIB
import torch
import torch.nn as nn
import torch.nn.functional as F

from .base_rnn import BaseRNN


__all__ = [
    'LayerNormIndRNN'
]


#@torch.jit.script
def LayerNormIndRNNScript(
    training: bool,
    zoneout_prob: float,
    input,
    h0,
    kernel,
    recurrent_scale,
    bias,
    zoneout_mask):
  """Reference (non-fused) layer-normalized IndRNN recurrence."""
  time_steps = input.shape[0]
  hidden_size = kernel.shape[1]

  h = [h0]
  # Normalize the input projections for all time steps in one shot;
  # gamma[0] is the input gain.
  Wx = F.layer_norm(input @ kernel, (hidden_size,), weight=gamma[0]) + bias
  for t in range(time_steps):
    h.append(torch.tanh(Wx[t] + h[-1] * recurrent_scale))
    if zoneout_prob:
      if training:
        # Zoneout (training): keep a random subset of units from h_{t-1}.
        h[-1] = (h[-1] - h[-2]) * zoneout_mask[t] + h[-2]
      else:
        # Zoneout (inference): blend states by the expected retention rate.
        h[-1] = zoneout_prob * h[-2] + (1 - zoneout_prob) * h[-1]
  h = torch.stack(h)
  return h


class LayerNormIndRNNFunction(torch.autograd.Function):
  """Autograd bridge to the fused CUDA LayerNormIndRNN kernels."""

  @staticmethod
  def forward(ctx, training, zoneout_prob, *inputs):
    output = LIB.layer_norm_indrnn_forward(training, zoneout_prob, *inputs)
    # Save the op inputs (minus h0) and every forward cache for backward.
    ctx.save_for_backward(inputs[0], *inputs[2:], *output)
    ctx.training = training
    return output[0]

  @staticmethod
  def backward(ctx, grad_h):
    if not ctx.training:
      raise RuntimeError('LayerNormIndRNN backward can only be called in training mode')

    saved = [*ctx.saved_tensors]
    # The CUDA backward kernel expects x and the kernel pre-transposed.
    saved[0] = saved[0].permute(2, 0, 1).contiguous()
    saved[1] = saved[1].permute(1, 0).contiguous()
    grads = LIB.layer_norm_indrnn_backward(*saved, grad_h.contiguous())
    # No gradients for `training`, `zoneout_prob`, or the zoneout mask.
    return (None, None, *grads, None)


class LayerNormIndRNN(BaseRNN):
  """
  Layer Normalized Independently Recurrent Neural Network layer.

  This IndRNN layer applies layer normalization to the input activations of a
  standard IndRNN. The implementation is fused and GPU-accelerated.

  This layer has built-in support for Zoneout regularization.

  See [\_\_init\_\_](#__init__) and [forward](#forward) for usage.
  """

  def __init__(
      self,
      input_size,
      hidden_size,
      batch_first=False,
      zoneout=0.0,
      return_state_sequence=False):
    """
    Initialize the parameters of the IndRNN layer.

    Arguments:
      input_size: int, the feature dimension of the input.
      hidden_size: int, the feature dimension of the output.
      batch_first: (optional) bool, if `True`, then the input and output
        tensors are provided as `(batch, seq, feature)`.
      zoneout: (optional) float, sets the zoneout rate for Zoneout
        regularization.
      return_state_sequence: (optional) bool, if `True`, the forward pass will
        return the entire state sequence instead of just the final state. Note
        that if the input is a padded sequence, the returned state will also
        be a padded sequence.

    Variables:
      kernel: the input projection weight matrix. Dimensions
        (input_size, hidden_size). Initialized with Xavier uniform
        initialization.
      recurrent_scale: the recurrent scale weight vector. Dimensions
        (hidden_size). Initialized uniformly in [-0.5, 0.5]. Note that this
        initialization scheme is different than in the original authors'
        implementation. See https://github.com/lmnt-com/haste/issues/7
        for details.
      bias: the RNN bias vector. Dimensions (hidden_size). Initialized to
        zeros.
      gamma: the input and recurrent normalization gain. Dimensions
        (2, hidden_size) with `gamma[0]` specifying the input gain and
        `gamma[1]` specifying the recurrent gain. Initialized to ones.
    """
    super().__init__(input_size, hidden_size, batch_first, zoneout, return_state_sequence)

    if zoneout < 0 or zoneout > 1:
      raise ValueError('LayerNormIndRNN: zoneout must be in [0.0, 1.0]')

    self.input_size = input_size
    self.hidden_size = hidden_size
    self.batch_first = batch_first
    self.zoneout = zoneout

    self.kernel = nn.Parameter(torch.empty(input_size, hidden_size))
    self.recurrent_scale = nn.Parameter(torch.empty(hidden_size))
    self.bias = nn.Parameter(torch.empty(hidden_size))
    self.gamma = nn.Parameter(torch.empty(2, hidden_size))
    self.reset_parameters()

  def reset_parameters(self):
    """Resets this layer's parameters to their initial values."""
    nn.init.xavier_uniform_(self.kernel)
    nn.init.uniform_(self.recurrent_scale, -0.5, 0.5)
    nn.init.zeros_(self.bias)
    nn.init.ones_(self.gamma)

  def forward(self, input, state=None, lengths=None):
    """
    Runs a forward pass of the IndRNN layer.

    Arguments:
      input: Tensor, a batch of input sequences to pass through the IndRNN.
        Dimensions (seq_len, batch_size, input_size) if `batch_first` is
        `False`, otherwise (batch_size, seq_len, input_size).
      state: (optional) Tensor, the initial state for each batch element in
        `input`. Dimensions (1, batch_size, hidden_size). Defaults to zeros.
      lengths: (optional) Tensor, list of sequence lengths for each batch
        element. Dimension (batch_size). This argument may be omitted if
        all batch elements are unpadded and have the same sequence length.

    Returns:
      output: Tensor, the output of the IndRNN layer. Dimensions (seq_len,
        batch_size, hidden_size) if `batch_first` is `False` (default) or
        (batch_size, seq_len, hidden_size) if `batch_first` is `True`. Note
        that if `lengths` was specified, the `output` tensor will not be
        masked. It's the caller's responsibility to either not use the
        invalid entries or to mask them out before using them.
      state: the hidden state for the last sequence item. Dimensions
        (1, batch_size, hidden_size).
    """
    input = self._permute(input)
    state_shape = [1, input.shape[1], self.hidden_size]
    h0 = self._get_state(input, state, state_shape)
    h = self._impl(input, h0[0], self._get_zoneout_mask(input))
    state = self._get_final_state(h, lengths)
    # h[0] is the initial state; drop it from the returned sequence.
    output = self._permute(h[1:])
    return output, state

  def _impl(self, input, state, zoneout_mask):
    # Use the fused CUDA op on GPU; otherwise the pure-PyTorch reference.
    if self._is_cuda():
      return LayerNormIndRNNFunction.apply(
          self.training,
          self.zoneout,
          input.contiguous(),
          state.contiguous(),
          self.kernel.contiguous(),
          self.recurrent_scale.contiguous(),
          self.bias.contiguous(),
          self.gamma.contiguous(),
          zoneout_mask.contiguous())
    else:
      return LayerNormIndRNNScript(
          self.training,
          self.zoneout,
          input.contiguous(),
          state.contiguous(),
          self.kernel.contiguous(),
          self.recurrent_scale.contiguous(),
          self.bias.contiguous(),
          self.gamma.contiguous(),
          zoneout_mask.contiguous())


================================================
FILE: frameworks/pytorch/layer_norm_lstm.cc
================================================
// Copyright 2020 LMNT, Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ==============================================================================

// NOTE(review): this file appears mangled by extraction — the #include
// targets and every template argument list (e.g. std::vector<Tensor>,
// data_ptr<scalar_t>(), packed_accessor32<...>, py::call_guard<...>) have
// been stripped. Code tokens are preserved verbatim below; restore the
// angle-bracket text from the repository before compiling.
#include
#include
#include
#include
#include "haste.h"
#include "support.h"

namespace {

namespace layer_norm = haste::v0::layer_norm;
namespace layer_norm_lstm = haste::v0::layer_norm_lstm;

using torch::Tensor;

// Fused LayerNormLSTM forward pass. Allocates the output state sequences and
// all intermediate activation/cache tensors needed by the backward pass, then
// hands everything to the haste CUDA kernels. Returns
// { output, output_state, act_* ... } in the order the Python-side autograd
// Function saves them.
std::vector layer_norm_lstm_forward(
    bool training,
    float zoneout_prob,
    Tensor x,
    Tensor h0,
    Tensor c0,
    Tensor kernel,
    Tensor recurrent_kernel,
    Tensor bias,
    Tensor gamma,
    Tensor gamma_h,
    Tensor beta_h,
    Tensor zoneout_mask) {
  const auto time_steps = x.size(0);
  const auto batch_size = x.size(1);
  const auto input_size = x.size(2);
  const auto hidden_size = recurrent_kernel.size(0);
  // Zoneout is active only when a rate is set AND a non-empty mask was passed.
  const bool has_zoneout = zoneout_prob && zoneout_mask.size(0);

  CHECK_INPUT(x);
  CHECK_INPUT(h0);
  CHECK_INPUT(c0);
  CHECK_INPUT(kernel);
  CHECK_INPUT(recurrent_kernel);
  CHECK_INPUT(bias);
  CHECK_INPUT(gamma);
  CHECK_INPUT(gamma_h);
  CHECK_INPUT(beta_h);
  CHECK_INPUT(zoneout_mask);

  const auto options = x.options();
  // Pin the current device to the input's device for all allocations/kernels.
  const at::cuda::CUDAGuard guard(options.device_index());
  // time_steps + 1 entries: slot 0 holds the initial state.
  Tensor output = torch::zeros({ time_steps + 1, batch_size, hidden_size }, options);
  Tensor output_state = torch::zeros({ time_steps + 1, batch_size, hidden_size }, options);
  Tensor act_Wx = torch::empty({ time_steps, batch_size, hidden_size * 4 }, options);
  Tensor act_Wx_norm = torch::empty({ time_steps, batch_size, hidden_size * 4 }, options);
  Tensor act_Wx_norm_cache = torch::empty({ time_steps, batch_size, 2 }, options);
  Tensor act_Rh = torch::empty({ time_steps, batch_size, hidden_size * 4 }, options);
  Tensor act_Rh_norm_cache = torch::empty({ time_steps, batch_size, 2 }, options);
  Tensor act_c_norm = torch::empty({ time_steps, batch_size, hidden_size }, options);
  Tensor act_c_norm_cache = torch::empty({ time_steps, batch_size, 2 }, options);
  // Scratch buffer reused across time steps for the recurrent projection.
  Tensor tmp_Rh = torch::empty({ batch_size, hidden_size * 4 }, options);

  output[0] = h0;
  output_state[0] = c0;

  AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "layer_norm_lstm_forward", ([&] {
    auto gamma_a = gamma.packed_accessor32();
    // layer_norm1: input projection (Wx); layer_norm2: recurrent projection
    // (Rh); layer_norm3: cell-state norm before the output gate.
    layer_norm::ForwardPass layer_norm1(
        time_steps * batch_size,
        hidden_size * 4,
        gamma_a[0].data(),
        nullptr,
        act_Wx_norm_cache.data_ptr());

    layer_norm::ForwardPass layer_norm2(
        time_steps * batch_size,
        hidden_size * 4,
        gamma_a[1].data(),
        nullptr,
        act_Rh_norm_cache.data_ptr());

    layer_norm::ForwardPass layer_norm3(
        time_steps * batch_size,
        hidden_size,
        gamma_h.data_ptr(),
        beta_h.data_ptr(),
        act_c_norm_cache.data_ptr());

    layer_norm_lstm::ForwardPass lstm(
        training,
        batch_size,
        input_size,
        hidden_size,
        at::cuda::getCurrentCUDABlasHandle(),
        at::cuda::getCurrentCUDAStream());

    lstm.Run(
        time_steps,
        kernel.data_ptr(),
        recurrent_kernel.data_ptr(),
        bias.data_ptr(),
        x.data_ptr(),
        output.data_ptr(),
        output_state.data_ptr(),
        act_Wx.data_ptr(),
        tmp_Rh.data_ptr(),
        layer_norm1,
        act_Wx_norm.data_ptr(),
        act_Rh.data_ptr(),
        layer_norm2,
        layer_norm3,
        act_c_norm.data_ptr(),
        has_zoneout ? zoneout_prob : 0.0f,
        has_zoneout ? zoneout_mask.data_ptr() : nullptr);
  }));

  return {
    output,
    output_state,
    act_Wx,
    act_Wx_norm,
    act_Wx_norm_cache,
    act_Rh,
    act_Rh_norm_cache,
    act_c_norm,
    act_c_norm_cache
  };
}

// Fused LayerNormLSTM backward pass. Inputs arrive pre-transposed (x_t,
// kernel_t, recurrent_kernel_t) from the Python side. Returns
// { dx, dh, dc, dW, dR, db, dgamma, dgamma_h, dbeta_h } — this ordering must
// stay in sync with LayerNormLSTMFunction.backward.
std::vector layer_norm_lstm_backward(
    Tensor x_t,
    Tensor kernel_t,
    Tensor recurrent_kernel_t,
    Tensor bias,
    Tensor gamma,
    Tensor gamma_h,
    Tensor beta_h,
    Tensor zoneout_mask,
    Tensor h,
    Tensor c,
    Tensor act_Wx,
    Tensor act_Wx_norm,
    Tensor act_Wx_norm_cache,
    Tensor act_Rh,
    Tensor act_Rh_norm_cache,
    Tensor act_c_norm,
    Tensor act_c_norm_cache,
    Tensor dh_new,
    Tensor dc_new) {
  // Note the transposed layout: (input_size, time_steps, batch_size).
  const auto input_size = x_t.size(0);
  const auto time_steps = x_t.size(1);
  const auto batch_size = x_t.size(2);
  const auto hidden_size = recurrent_kernel_t.size(1);
  const bool has_zoneout = !!zoneout_mask.size(0);

  CHECK_INPUT(x_t);
  CHECK_INPUT(kernel_t);
  CHECK_INPUT(recurrent_kernel_t);
  CHECK_INPUT(bias);
  CHECK_INPUT(gamma);
  CHECK_INPUT(gamma_h);
  CHECK_INPUT(beta_h);
  CHECK_INPUT(zoneout_mask);
  CHECK_INPUT(h);
  CHECK_INPUT(c);
  CHECK_INPUT(act_Wx);
  CHECK_INPUT(act_Wx_norm);
  CHECK_INPUT(act_Wx_norm_cache);
  CHECK_INPUT(act_Rh);
  CHECK_INPUT(act_Rh_norm_cache);
  CHECK_INPUT(act_c_norm);
  CHECK_INPUT(act_c_norm_cache);
  CHECK_INPUT(dh_new);
  CHECK_INPUT(dc_new);

  const auto options = x_t.options();
  const at::cuda::CUDAGuard guard(options.device_index());
  Tensor dx = torch::empty({ time_steps, batch_size, input_size }, options);
  // Weight gradients are accumulated across time steps, hence zeros not empty.
  Tensor dW = torch::zeros({ input_size, hidden_size * 4 }, options);
  Tensor dR = torch::zeros({ hidden_size, hidden_size * 4 }, options);
  Tensor db = torch::zeros({ hidden_size * 4 }, options);
  Tensor dgamma = torch::zeros_like(gamma);
  Tensor dgamma_h = torch::zeros_like(gamma_h);
  Tensor dbeta_h = torch::zeros_like(beta_h);
  Tensor dh = torch::zeros({ batch_size, hidden_size }, options);
  Tensor dc = torch::zeros({ batch_size, hidden_size }, options);

  AT_DISPATCH_FLOATING_TYPES(x_t.scalar_type(), "layer_norm_lstm_backward", ([&] {
    auto gamma_a = gamma.packed_accessor32();
    auto dgamma_a = dgamma.packed_accessor32();
    auto c_a = c.packed_accessor32();

    layer_norm::BackwardPass layer_norm1(
        time_steps * batch_size,
        hidden_size * 4,
        gamma_a[0].data(),
        nullptr,
        act_Wx.data_ptr(),
        dgamma_a[0].data(),
        nullptr,
        act_Wx_norm_cache.data_ptr());

    layer_norm::BackwardPass layer_norm2(
        time_steps * batch_size,
        hidden_size * 4,
        gamma_a[1].data(),
        nullptr,
        act_Rh.data_ptr(),
        dgamma_a[1].data(),
        nullptr,
        act_Rh_norm_cache.data_ptr());

    // layer_norm3 reads c[1:] (c_a[1]) — the post-update cell states.
    layer_norm::BackwardPass layer_norm3(
        time_steps * batch_size,
        hidden_size,
        gamma_h.data_ptr(),
        beta_h.data_ptr(),
        c_a[1].data(),
        dgamma_h.data_ptr(),
        dbeta_h.data_ptr(),
        act_c_norm_cache.data_ptr());

    layer_norm_lstm::BackwardPass lstm(
        batch_size,
        input_size,
        hidden_size,
        at::cuda::getCurrentCUDABlasHandle(),
        at::cuda::getCurrentCUDAStream());

    lstm.Run(
        time_steps,
        kernel_t.data_ptr(),
        recurrent_kernel_t.data_ptr(),
        bias.data_ptr(),
        x_t.data_ptr(),
        h.data_ptr(),
        c.data_ptr(),
        dh_new.data_ptr(),
        dc_new.data_ptr(),
        dx.data_ptr(),
        dW.data_ptr(),
        dR.data_ptr(),
        db.data_ptr(),
        dh.data_ptr(),
        dc.data_ptr(),
        act_Wx.data_ptr(),
        layer_norm1,
        act_Wx_norm.data_ptr(),
        act_Rh.data_ptr(),
        layer_norm2,
        layer_norm3,
        act_c_norm.data_ptr(),
        has_zoneout ? zoneout_mask.data_ptr() : nullptr);
  }));

  return { dx, dh, dc, dW, dR, db, dgamma, dgamma_h, dbeta_h };
}

}  // anonymous namespace

// Registers the forward/backward ops on the extension module; the call guard
// (template argument stripped — presumably py::gil_scoped_release) frees the
// GIL while the kernels run.
void layer_norm_lstm_init(py::module& m) {
  m.def("layer_norm_lstm_forward", &layer_norm_lstm_forward, "LayerNormLSTM forward", py::call_guard());
  m.def("layer_norm_lstm_backward", &layer_norm_lstm_backward, "LayerNormLSTM backward", py::call_guard());
}


================================================ FILE: frameworks/pytorch/layer_norm_lstm.py ================================================
# Copyright 2020 LMNT, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Layer Normalized Long Short-Term Memory"""

import haste_pytorch_lib as LIB
import torch
import torch.nn as nn
import torch.nn.functional as F

from .base_rnn import BaseRNN


__all__ = [
    'LayerNormLSTM'
]


# Reference (CPU) implementation matching the fused CUDA kernels invoked via
# LayerNormLSTMFunction below. JIT scripting is currently disabled.
#@torch.jit.script
def LayerNormLSTMScript(
    training: bool,
    zoneout_prob: float,
    input,
    h0,
    c0,
    kernel,
    recurrent_kernel,
    bias,
    gamma,
    gamma_h,
    beta_h,
    zoneout_mask):
  """Runs a layer-normalized LSTM over `input` on the CPU.

  Returns (h, c), each with `time_steps + 1` entries along dim 0 where index 0
  holds the initial state. Layer norm is applied to the input projection
  (gain `gamma[0]`), the recurrent projection (gain `gamma[1]`), and the cell
  state before the output gate (gain `gamma_h`, bias `beta_h`).
  """
  time_steps = input.shape[0]
  batch_size = input.shape[1]
  hidden_size = recurrent_kernel.shape[0]

  h = [h0]
  c = [c0]
  # Input projection + norm is time-invariant, so it is precomputed for all
  # time steps at once.
  Wx = F.layer_norm(input @ kernel, (hidden_size * 4,), weight=gamma[0])
  for t in range(time_steps):
    v = F.layer_norm(h[t] @ recurrent_kernel, (hidden_size * 4,), weight=gamma[1]) + Wx[t] + bias
    # Gate layout is i,g,f,o (see the class docstring below).
    i, g, f, o = torch.chunk(v, 4, 1)
    i = torch.sigmoid(i)
    g = torch.tanh(g)
    f = torch.sigmoid(f)
    o = torch.sigmoid(o)
    c.append(f * c[t] + i * g)
    h.append(o * torch.tanh(F.layer_norm(c[-1], (hidden_size,), weight=gamma_h, bias=beta_h)))
    if zoneout_prob:
      if training:
        # Zoneout (training): keep the previous state wherever mask == 0.
        h[-1] = (h[-1] - h[-2]) * zoneout_mask[t] + h[-2]
      else:
        # Zoneout (inference): deterministic expected value of the update.
        h[-1] = zoneout_prob * h[-2] + (1 - zoneout_prob) * h[-1]
  h = torch.stack(h)
  c = torch.stack(c)
  return h, c


class LayerNormLSTMFunction(torch.autograd.Function):
  """Autograd bridge to the fused CUDA forward/backward kernels."""

  @staticmethod
  def forward(ctx, training, zoneout_prob, *inputs):
    # inputs: (x, h0, c0, kernel, recurrent_kernel, bias, gamma, gamma_h,
    #          beta_h, zoneout_mask).
    outputs = LIB.layer_norm_lstm_forward(training, zoneout_prob, *inputs)
    # Saved order must match LIB.layer_norm_lstm_backward's argument order;
    # h0/c0 (inputs[1:3]) are skipped because the state sequences in
    # `outputs` already contain them.
    ctx.save_for_backward(inputs[0], *inputs[3:], *outputs)
    ctx.mark_non_differentiable(inputs[-1])  # zoneout mask is non-differentiable
    ctx.training = training
    return outputs[0], outputs[1]

  @staticmethod
  def backward(ctx, grad_h, grad_c):
    if not ctx.training:
      raise RuntimeError('LayerNormLSTM backward can only be called in training mode')
    saved = [*ctx.saved_tensors]
    # The CUDA backward expects transposed layouts.
    saved[0] = saved[0].permute(2, 0, 1).contiguous()  # x -> x_t
    saved[1] = saved[1].permute(1, 0).contiguous()     # kernel -> kernel_t
    saved[2] = saved[2].permute(1, 0).contiguous()     # recurrent_kernel -> recurrent_kernel_t
    grads = LIB.layer_norm_lstm_backward(*saved, grad_h.contiguous(), grad_c.contiguous())
    # Gradients align positionally with forward's `*inputs`; None for the
    # non-tensor args (training, zoneout_prob) and the zoneout mask.
    return (None, None, *grads, None)


class LayerNormLSTM(BaseRNN):
  """
  Layer Normalized Long Short-Term Memory layer.

  This LSTM layer applies layer normalization to the input, recurrent, and
  output activations of a standard LSTM. The implementation is fused and
  GPU-accelerated. DropConnect and Zoneout regularization are built-in, and
  this layer allows setting a non-zero initial forget gate bias.

  Details about the exact function this layer implements can be found at
  https://github.com/lmnt-com/haste/issues/1.

  See [\_\_init\_\_](#__init__) and [forward](#forward) for usage.
  """

  def __init__(self,
      input_size,
      hidden_size,
      batch_first=False,
      forget_bias=1.0,
      dropout=0.0,
      zoneout=0.0,
      return_state_sequence=False):
    """
    Initialize the parameters of the LSTM layer.

    Arguments:
      input_size: int, the feature dimension of the input.
      hidden_size: int, the feature dimension of the output.
      batch_first: (optional) bool, if `True`, then the input and output
        tensors are provided as `(batch, seq, feature)`.
      forget_bias: (optional) float, sets the initial bias of the forget gate
        for this LSTM cell.
      dropout: (optional) float, sets the dropout rate for DropConnect
        regularization on the recurrent matrix.
      zoneout: (optional) float, sets the zoneout rate for Zoneout
        regularization.
      return_state_sequence: (optional) bool, if `True`, the forward pass will
        return the entire state sequence instead of just the final state. Note
        that if the input is a padded sequence, the returned state will also
        be a padded sequence.

    Variables:
      kernel: the input projection weight matrix. Dimensions
        (input_size, hidden_size * 4) with `i,g,f,o` gate layout. Initialized
        with Xavier uniform initialization.
      recurrent_kernel: the recurrent projection weight matrix. Dimensions
        (hidden_size, hidden_size * 4) with `i,g,f,o` gate layout. Initialized
        with orthogonal initialization.
      bias: the projection bias vector. Dimensions (hidden_size * 4) with
        `i,g,f,o` gate layout. The forget gate biases are initialized to
        `forget_bias` and the rest are zeros.
      gamma: the input and recurrent normalization gain. Dimensions
        (2, hidden_size * 4) with `gamma[0]` specifying the input gain and
        `gamma[1]` specifying the recurrent gain. Initialized to ones.
      gamma_h: the output normalization gain. Dimensions (hidden_size).
        Initialized to ones.
      beta_h: the output normalization bias. Dimensions (hidden_size).
        Initialized to zeros.
    """
    super().__init__(input_size, hidden_size, batch_first, zoneout, return_state_sequence)

    if dropout < 0 or dropout > 1:
      raise ValueError('LayerNormLSTM: dropout must be in [0.0, 1.0]')
    if zoneout < 0 or zoneout > 1:
      raise ValueError('LayerNormLSTM: zoneout must be in [0.0, 1.0]')

    self.forget_bias = forget_bias
    self.dropout = dropout

    self.kernel = nn.Parameter(torch.empty(input_size, hidden_size * 4))
    self.recurrent_kernel = nn.Parameter(torch.empty(hidden_size, hidden_size * 4))
    self.bias = nn.Parameter(torch.empty(hidden_size * 4))
    self.gamma = nn.Parameter(torch.empty(2, hidden_size * 4))
    self.gamma_h = nn.Parameter(torch.empty(hidden_size))
    self.beta_h = nn.Parameter(torch.empty(hidden_size))
    self.reset_parameters()

  def reset_parameters(self):
    """Resets this layer's parameters to their initial values."""
    hidden_size = self.hidden_size
    # Initialize each gate's slice independently so the per-gate fan-in/out
    # statistics match a single-gate initialization.
    for i in range(4):
      nn.init.xavier_uniform_(self.kernel[:, i*hidden_size:(i+1)*hidden_size])
      nn.init.orthogonal_(self.recurrent_kernel[:, i*hidden_size:(i+1)*hidden_size])
    nn.init.zeros_(self.bias)
    # Gate layout is i,g,f,o, so the forget gate occupies slice [2h:3h).
    nn.init.constant_(self.bias[hidden_size*2:hidden_size*3], self.forget_bias)
    nn.init.ones_(self.gamma)
    nn.init.ones_(self.gamma_h)
    nn.init.zeros_(self.beta_h)

  def forward(self, input, state=None, lengths=None):
    """
    Runs a forward pass of the LSTM layer.

    Arguments:
      input: Tensor, a batch of input sequences to pass through the LSTM.
        Dimensions (seq_len, batch_size, input_size) if `batch_first` is
        `False`, otherwise (batch_size, seq_len, input_size).
      state: (optional) (Tensor, Tensor), the initial hidden and cell states
        for each batch element in `input`, each with dimensions
        (1, batch_size, hidden_size). Defaults to zeros.
      lengths: (optional) Tensor, list of sequence lengths for each batch
        element. Dimension (batch_size). This argument may be omitted if
        all batch elements are unpadded and have the same sequence length.

    Returns:
      output: Tensor, the output of the LSTM layer. Dimensions
        (seq_len, batch_size, hidden_size) if `batch_first` is `False`
        (default) or (batch_size, seq_len, hidden_size) if `batch_first` is
        `True`. Note that if `lengths` was specified, the `output` tensor
        will not be masked. It's the caller's responsibility to either not
        use the invalid entries or to mask them out before using them.
      (h_n, c_n): the hidden and cell states, respectively, for the last
        sequence item. Dimensions (1, batch_size, hidden_size).
    """
    input = self._permute(input)
    state_shape = [1, input.shape[1], self.hidden_size]
    state_shape = (state_shape, state_shape)
    h0, c0 = self._get_state(input, state, state_shape)
    h, c = self._impl(input, (h0[0], c0[0]), self._get_zoneout_mask(input))
    state = self._get_final_state((h, c), lengths)
    # h[0] is the initial state; strip it from the output sequence.
    output = self._permute(h[1:])
    return output, state

  def _impl(self, input, state, zoneout_mask):
    # Dispatch to the fused CUDA kernel when on GPU, otherwise fall back to
    # the pure-PyTorch reference implementation above. DropConnect is applied
    # to the recurrent matrix here, outside the fused kernel.
    if self._is_cuda():
      return LayerNormLSTMFunction.apply(
          self.training,
          self.zoneout,
          input.contiguous(),
          state[0].contiguous(),
          state[1].contiguous(),
          self.kernel.contiguous(),
          F.dropout(self.recurrent_kernel, self.dropout, self.training).contiguous(),
          self.bias.contiguous(),
          self.gamma.contiguous(),
          self.gamma_h.contiguous(),
          self.beta_h.contiguous(),
          zoneout_mask.contiguous())
    else:
      return LayerNormLSTMScript(
          self.training,
          self.zoneout,
          input.contiguous(),
          state[0].contiguous(),
          state[1].contiguous(),
          self.kernel.contiguous(),
          F.dropout(self.recurrent_kernel, self.dropout, self.training).contiguous(),
          self.bias.contiguous(),
          self.gamma.contiguous(),
          self.gamma_h.contiguous(),
          self.beta_h.contiguous(),
          zoneout_mask.contiguous())
================================================ FILE: frameworks/pytorch/lstm.cc ================================================
// Copyright 2020 LMNT, Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ==============================================================================

// NOTE(review): this file appears mangled by extraction — the #include
// targets and every template argument list (e.g. std::vector<Tensor>,
// data_ptr<scalar_t>(), py::call_guard<...>) have been stripped. Code tokens
// are preserved verbatim; restore the angle-bracket text before compiling.
#include
#include
#include
#include
#include "haste.h"
#include "support.h"

namespace {

using haste::v0::lstm::ForwardPass;
using haste::v0::lstm::BackwardPass;

using torch::Tensor;

// Fused LSTM forward pass. Returns { output, output_state, cache }; `cache`
// holds the activations the backward pass needs.
std::vector lstm_forward(
    bool training,
    float zoneout_prob,
    Tensor x,
    Tensor h0,
    Tensor c0,
    Tensor kernel,
    Tensor recurrent_kernel,
    Tensor bias,
    Tensor zoneout_mask) {
  const auto time_steps = x.size(0);
  const auto batch_size = x.size(1);
  const auto input_size = x.size(2);
  const auto hidden_size = recurrent_kernel.size(0);
  // Zoneout is active only when a rate is set AND a non-empty mask was passed.
  const bool has_zoneout = zoneout_prob && zoneout_mask.size(0);

  CHECK_INPUT(x);
  CHECK_INPUT(h0);
  CHECK_INPUT(c0);
  CHECK_INPUT(kernel);
  CHECK_INPUT(recurrent_kernel);
  CHECK_INPUT(bias);
  CHECK_INPUT(zoneout_mask);

  const auto options = x.options();
  const at::cuda::CUDAGuard guard(options.device_index());
  // time_steps + 1 entries: slot 0 holds the initial state.
  Tensor output = torch::empty({ time_steps + 1, batch_size, hidden_size }, options);
  Tensor output_state = torch::empty({ time_steps + 1, batch_size, hidden_size }, options);
  Tensor cache = torch::empty({ time_steps, batch_size, hidden_size * 4 }, options);
  // Scratch buffer reused across time steps for the recurrent projection.
  Tensor tmp_Rh = torch::empty({ batch_size, hidden_size * 4 }, options);

  output[0] = h0;
  output_state[0] = c0;

  AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "lstm_forward", ([&] {
    ForwardPass forward(
        training,
        batch_size,
        input_size,
        hidden_size,
        at::cuda::getCurrentCUDABlasHandle(),
        at::cuda::getCurrentCUDAStream());

    forward.Run(
        time_steps,
        kernel.data_ptr(),
        recurrent_kernel.data_ptr(),
        bias.data_ptr(),
        x.data_ptr(),
        output.data_ptr(),
        output_state.data_ptr(),
        cache.data_ptr(),
        tmp_Rh.data_ptr(),
        has_zoneout ? zoneout_prob : 0.0f,
        has_zoneout ? zoneout_mask.data_ptr() : nullptr);
  }));

  return { output, output_state, cache };
}

// Fused LSTM backward pass. Inputs arrive pre-transposed (x_t, kernel_t,
// recurrent_kernel_t) from the Python side. Returns
// { dx, dh, dc, dW, dR, db } — this ordering must stay in sync with
// LSTMFunction.backward.
std::vector lstm_backward(
    Tensor x_t,
    Tensor kernel_t,
    Tensor recurrent_kernel_t,
    Tensor bias,
    Tensor zoneout_mask,
    Tensor h,
    Tensor c,
    Tensor cache,
    Tensor dh_new,
    Tensor dc_new) {
  // Note the transposed layout: (input_size, time_steps, batch_size).
  const auto input_size = x_t.size(0);
  const auto time_steps = x_t.size(1);
  const auto batch_size = x_t.size(2);
  const auto hidden_size = recurrent_kernel_t.size(1);
  const bool has_zoneout = !!zoneout_mask.size(0);

  CHECK_INPUT(x_t);
  CHECK_INPUT(kernel_t);
  CHECK_INPUT(recurrent_kernel_t);
  CHECK_INPUT(bias);
  CHECK_INPUT(h);
  CHECK_INPUT(c);
  CHECK_INPUT(cache);
  CHECK_INPUT(dh_new);
  CHECK_INPUT(dc_new);
  CHECK_INPUT(zoneout_mask);

  const auto options = x_t.options();
  const at::cuda::CUDAGuard guard(options.device_index());
  Tensor dx = torch::empty({ time_steps, batch_size, input_size }, options);
  // Weight gradients are accumulated across time steps, hence zeros not empty.
  Tensor dW = torch::zeros({ input_size, hidden_size * 4 }, options);
  Tensor dR = torch::zeros({ hidden_size, hidden_size * 4 }, options);
  Tensor db = torch::zeros_like(bias);
  Tensor dh = torch::zeros({ batch_size, hidden_size }, options);
  Tensor dc = torch::zeros({ batch_size, hidden_size }, options);

  AT_DISPATCH_FLOATING_TYPES(x_t.scalar_type(), "lstm_backward", ([&] {
    BackwardPass backward(
        batch_size,
        input_size,
        hidden_size,
        at::cuda::getCurrentCUDABlasHandle(),
        at::cuda::getCurrentCUDAStream());

    backward.Run(
        time_steps,
        kernel_t.data_ptr(),
        recurrent_kernel_t.data_ptr(),
        bias.data_ptr(),
        x_t.data_ptr(),
        h.data_ptr(),
        c.data_ptr(),
        dh_new.data_ptr(),
        dc_new.data_ptr(),
        dx.data_ptr(),
        dW.data_ptr(),
        dR.data_ptr(),
        db.data_ptr(),
        dh.data_ptr(),
        dc.data_ptr(),
        cache.data_ptr(),
        has_zoneout ? zoneout_mask.data_ptr() : nullptr);
  }));

  return { dx, dh, dc, dW, dR, db };
}

}  // anonymous namespace

// Registers the forward/backward ops on the extension module; the call guard
// (template argument stripped — presumably py::gil_scoped_release) frees the
// GIL while the kernels run.
void lstm_init(py::module& m) {
  m.def("lstm_forward", &lstm_forward, "LSTM forward", py::call_guard());
  m.def("lstm_backward", &lstm_backward, "LSTM backward", py::call_guard());
}


================================================ FILE: frameworks/pytorch/lstm.py ================================================
# Copyright 2020 LMNT, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Long Short-Term Memory"""

import haste_pytorch_lib as LIB
import torch
import torch.nn as nn
import torch.nn.functional as F

from .base_rnn import BaseRNN


__all__ = [
    'LSTM'
]


# Reference (CPU) implementation matching the fused CUDA kernels invoked via
# LSTMFunction below. JIT scripting is currently disabled.
#@torch.jit.script
def LSTMScript(
    training: bool,
    zoneout_prob: float,
    input,
    h0,
    c0,
    kernel,
    recurrent_kernel,
    bias,
    zoneout_mask):
  """Runs a plain LSTM over `input` on the CPU.

  Returns (h, c), each with `time_steps + 1` entries along dim 0 where index 0
  holds the initial state.
  """
  time_steps = input.shape[0]
  batch_size = input.shape[1]
  hidden_size = recurrent_kernel.shape[0]

  h = [h0]
  c = [c0]
  # The input projection is time-invariant, so it is precomputed for all time
  # steps at once.
  Wx = input @ kernel
  for t in range(time_steps):
    v = h[t] @ recurrent_kernel + Wx[t] + bias
    # Gate layout is i,g,f,o (see the class docstring below).
    i, g, f, o = torch.chunk(v, 4, 1)
    i = torch.sigmoid(i)
    g = torch.tanh(g)
    f = torch.sigmoid(f)
    o = torch.sigmoid(o)
    c.append(f * c[t] + i * g)
    h.append(o * torch.tanh(c[-1]))
    if zoneout_prob:
      if training:
        # Zoneout (training): keep the previous state wherever mask == 0.
        h[-1] = (h[-1] - h[-2]) * zoneout_mask[t] + h[-2]
      else:
        # Zoneout (inference): deterministic expected value of the update.
        h[-1] = zoneout_prob * h[-2] + (1 - zoneout_prob) * h[-1]
  h = torch.stack(h)
  c = torch.stack(c)
  return h, c


class LSTMFunction(torch.autograd.Function):
  """Autograd bridge to the fused CUDA forward/backward kernels."""

  @staticmethod
  def forward(ctx, training, zoneout_prob, *inputs):
    # inputs: (x, h0, c0, kernel, recurrent_kernel, bias, zoneout_mask).
    h, c, cache = LIB.lstm_forward(training, zoneout_prob, *inputs)
    # Saved order must match LIB.lstm_backward's argument order; h0/c0
    # (inputs[1:3]) are skipped because h and c already contain them.
    ctx.save_for_backward(inputs[0], *inputs[3:], h, c, cache)
    ctx.mark_non_differentiable(inputs[-1])  # zoneout mask
    ctx.training = training
    return h, c

  @staticmethod
  def backward(ctx, grad_h, grad_c):
    if not ctx.training:
      raise RuntimeError('LSTM backward can only be called in training mode')
    saved = [*ctx.saved_tensors]
    # The CUDA backward expects transposed layouts.
    saved[0] = saved[0].permute(2, 0, 1).contiguous()  # x -> x_t
    saved[1] = saved[1].permute(1, 0).contiguous()     # kernel -> kernel_t
    saved[2] = saved[2].permute(1, 0).contiguous()     # recurrent_kernel -> recurrent_kernel_t
    grads = LIB.lstm_backward(*saved, grad_h.contiguous(), grad_c.contiguous())
    # Gradients align positionally with forward's `*inputs`; None for the
    # non-tensor args (training, zoneout_prob) and the zoneout mask.
    return (None, None, *grads, None)


class LSTM(BaseRNN):
  """
  Long Short-Term Memory layer.

  This LSTM layer offers a fused, GPU-accelerated PyTorch op for inference
  and training. Although this implementation is comparable in performance to
  cuDNN's LSTM, it offers additional options not typically found in other
  high-performance implementations. DropConnect and Zoneout regularization
  are built-in, and this layer allows setting a non-zero initial forget gate
  bias.

  See [\_\_init\_\_](#__init__) and [forward](#forward) for general usage.
  See [from_native_weights](#from_native_weights) and
  [to_native_weights](#to_native_weights) for compatibility with PyTorch
  LSTMs.
  """

  def __init__(self,
      input_size,
      hidden_size,
      batch_first=False,
      forget_bias=1.0,
      dropout=0.0,
      zoneout=0.0,
      return_state_sequence=False):
    """
    Initialize the parameters of the LSTM layer.

    Arguments:
      input_size: int, the feature dimension of the input.
      hidden_size: int, the feature dimension of the output.
      batch_first: (optional) bool, if `True`, then the input and output
        tensors are provided as `(batch, seq, feature)`.
      forget_bias: (optional) float, sets the initial bias of the forget gate
        for this LSTM cell.
      dropout: (optional) float, sets the dropout rate for DropConnect
        regularization on the recurrent matrix.
      zoneout: (optional) float, sets the zoneout rate for Zoneout
        regularization.
      return_state_sequence: (optional) bool, if `True`, the forward pass will
        return the entire state sequence instead of just the final state. Note
        that if the input is a padded sequence, the returned state will also
        be a padded sequence.

    Variables:
      kernel: the input projection weight matrix. Dimensions
        (input_size, hidden_size * 4) with `i,g,f,o` gate layout. Initialized
        with Xavier uniform initialization.
      recurrent_kernel: the recurrent projection weight matrix. Dimensions
        (hidden_size, hidden_size * 4) with `i,g,f,o` gate layout. Initialized
        with orthogonal initialization.
      bias: the projection bias vector. Dimensions (hidden_size * 4) with
        `i,g,f,o` gate layout. The forget gate biases are initialized to
        `forget_bias` and the rest are zeros.
    """
    super().__init__(input_size, hidden_size, batch_first, zoneout, return_state_sequence)

    if dropout < 0 or dropout > 1:
      raise ValueError('LSTM: dropout must be in [0.0, 1.0]')
    if zoneout < 0 or zoneout > 1:
      raise ValueError('LSTM: zoneout must be in [0.0, 1.0]')

    self.forget_bias = forget_bias
    self.dropout = dropout

    self.kernel = nn.Parameter(torch.empty(input_size, hidden_size * 4))
    self.recurrent_kernel = nn.Parameter(torch.empty(hidden_size, hidden_size * 4))
    self.bias = nn.Parameter(torch.empty(hidden_size * 4))
    self.reset_parameters()

  def to_native_weights(self):
    """
    Converts Haste LSTM weights to native PyTorch LSTM weights.

    Haste uses `i,g,f,o` gate order and (in, out)-shaped matrices; PyTorch
    uses `i,f,g,o` and transposed matrices, with the bias split into two
    halves that PyTorch sums at runtime.

    Returns:
      weight_ih_l0: Parameter, the input-hidden weights of the LSTM layer.
      weight_hh_l0: Parameter, the hidden-hidden weights of the LSTM layer.
      bias_ih_l0: Parameter, the input-hidden bias of the LSTM layer.
      bias_hh_l0: Parameter, the hidden-hidden bias of the LSTM layer.
    """
    def reorder_weights(w):
      # i,g,f,o (Haste) -> i,f,g,o (PyTorch).
      i, g, f, o = torch.chunk(w, 4, dim=-1)
      return torch.cat([i, f, g, o], dim=-1)

    kernel = reorder_weights(self.kernel).permute(1, 0).contiguous()
    recurrent_kernel = reorder_weights(self.recurrent_kernel).permute(1, 0).contiguous()
    # Split evenly so bias_ih + bias_hh reproduces the Haste bias.
    half_bias = reorder_weights(self.bias) / 2.0

    kernel = torch.nn.Parameter(kernel)
    recurrent_kernel = torch.nn.Parameter(recurrent_kernel)
    bias1 = torch.nn.Parameter(half_bias)
    bias2 = torch.nn.Parameter(half_bias.clone())
    return kernel, recurrent_kernel, bias1, bias2

  def from_native_weights(self, weight_ih_l0, weight_hh_l0, bias_ih_l0, bias_hh_l0):
    """
    Copies and converts the provided PyTorch LSTM weights into this layer.

    Arguments:
      weight_ih_l0: Parameter, the input-hidden weights of the PyTorch LSTM layer.
      weight_hh_l0: Parameter, the hidden-hidden weights of the PyTorch LSTM layer.
      bias_ih_l0: Parameter, the input-hidden bias of the PyTorch LSTM layer.
      bias_hh_l0: Parameter, the hidden-hidden bias of the PyTorch LSTM layer.
    """
    def reorder_weights(w):
      # i,f,g,o (PyTorch) -> i,g,f,o (Haste).
      i, f, g, o = torch.chunk(w, 4, dim=-1)
      return torch.cat([i, g, f, o], dim=-1)

    kernel = reorder_weights(weight_ih_l0.permute(1, 0)).contiguous()
    recurrent_kernel = reorder_weights(weight_hh_l0.permute(1, 0)).contiguous()
    # PyTorch keeps two bias vectors that are summed at runtime; Haste uses one.
    bias = reorder_weights(bias_ih_l0 + bias_hh_l0).contiguous()
    self.kernel = nn.Parameter(kernel)
    self.recurrent_kernel = nn.Parameter(recurrent_kernel)
    self.bias = nn.Parameter(bias)

  def reset_parameters(self):
    """Resets this layer's parameters to their initial values."""
    hidden_size = self.hidden_size
    # Initialize each gate's slice independently so the per-gate fan-in/out
    # statistics match a single-gate initialization.
    for i in range(4):
      nn.init.xavier_uniform_(self.kernel[:, i*hidden_size:(i+1)*hidden_size])
      nn.init.orthogonal_(self.recurrent_kernel[:, i*hidden_size:(i+1)*hidden_size])
    nn.init.zeros_(self.bias)
    # Gate layout is i,g,f,o, so the forget gate occupies slice [2h:3h).
    nn.init.constant_(self.bias[hidden_size*2:hidden_size*3], self.forget_bias)

  def forward(self, input, state=None, lengths=None):
    """
    Runs a forward pass of the LSTM layer.

    Arguments:
      input: Tensor, a batch of input sequences to pass through the LSTM.
        Dimensions (seq_len, batch_size, input_size) if `batch_first` is
        `False`, otherwise (batch_size, seq_len, input_size).
      state: (optional) (Tensor, Tensor), the initial hidden and cell states
        for each batch element in `input`, each with dimensions
        (1, batch_size, hidden_size). Defaults to zeros.
      lengths: (optional) Tensor, list of sequence lengths for each batch
        element. Dimension (batch_size). This argument may be omitted if
        all batch elements are unpadded and have the same sequence length.

    Returns:
      output: Tensor, the output of the LSTM layer. Dimensions
        (seq_len, batch_size, hidden_size) if `batch_first` is `False`
        (default) or (batch_size, seq_len, hidden_size) if `batch_first` is
        `True`. Note that if `lengths` was specified, the `output` tensor
        will not be masked. It's the caller's responsibility to either not
        use the invalid entries or to mask them out before using them.
      (h_n, c_n): the hidden and cell states, respectively, for the last
        sequence item. Dimensions (1, batch_size, hidden_size).
    """
    input = self._permute(input)
    state_shape = [1, input.shape[1], self.hidden_size]
    state_shape = (state_shape, state_shape)
    h0, c0 = self._get_state(input, state, state_shape)
    h, c = self._impl(input, (h0[0], c0[0]), self._get_zoneout_mask(input))
    state = self._get_final_state((h, c), lengths)
    # h[0] is the initial state; strip it from the output sequence.
    output = self._permute(h[1:])
    return output, state

  def _impl(self, input, state, zoneout_mask):
    # Dispatch to the fused CUDA kernel when on GPU, otherwise fall back to
    # the pure-PyTorch reference implementation above. DropConnect is applied
    # to the recurrent matrix here, outside the fused kernel.
    if self._is_cuda():
      return LSTMFunction.apply(
          self.training,
          self.zoneout,
          input.contiguous(),
          state[0].contiguous(),
          state[1].contiguous(),
          self.kernel.contiguous(),
          F.dropout(self.recurrent_kernel, self.dropout, self.training).contiguous(),
          self.bias.contiguous(),
          zoneout_mask.contiguous())
    else:
      return LSTMScript(
          self.training,
          self.zoneout,
          input.contiguous(),
          state[0].contiguous(),
          state[1].contiguous(),
          self.kernel.contiguous(),
          F.dropout(self.recurrent_kernel, self.dropout, self.training).contiguous(),
          self.bias.contiguous(),
          zoneout_mask.contiguous())


================================================ FILE: frameworks/pytorch/support.cc ================================================
// Copyright 2020 LMNT, Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ============================================================================== #include void gru_init(py::module&); void indrnn_init(py::module&); void lstm_init(py::module&); void layer_norm_gru_init(py::module&); void layer_norm_indrnn_init(py::module&); void layer_norm_lstm_init(py::module&); PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { gru_init(m); indrnn_init(m); lstm_init(m); layer_norm_gru_init(m); layer_norm_indrnn_init(m); layer_norm_lstm_init(m); } ================================================ FILE: frameworks/pytorch/support.h ================================================ // Copyright 2020 LMNT, Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // ============================================================================== #pragma once #include #define CHECK_CUDA(x) TORCH_CHECK(x.options().device().is_cuda(), #x " must be a CUDA tensor") #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) template struct native_type { using T = U; }; template<> struct native_type { using T = __half; }; template typename native_type::T* ptr(torch::Tensor t) { return reinterpret_cast::T*>(t.data_ptr()); } ================================================ FILE: frameworks/tf/__init__.py ================================================ # Copyright 2020 LMNT, Inc. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """ Haste: a fast, simple, and open RNN library. """ from ._version import __version__ # generated in setup.py from .gru import GRU from .gru_cell import GRUCell from .indrnn import IndRNN from .layer_norm import LayerNorm from .layer_norm_gru import LayerNormGRU from .layer_norm_gru_cell import LayerNormGRUCell from .layer_norm_indrnn import LayerNormIndRNN from .layer_norm_lstm import LayerNormLSTM from .layer_norm_lstm_cell import LayerNormLSTMCell from .lstm import LSTM from .zoneout_wrapper import ZoneoutWrapper __all__ = [ 'GRU', 'GRUCell', 'IndRNN', 'LayerNorm', 'LayerNormGRU', 'LayerNormGRUCell', 'LayerNormIndRNN', 'LayerNormLSTM', 'LayerNormLSTMCell', 'LSTM', 'ZoneoutWrapper' ] ================================================ FILE: frameworks/tf/arena.h ================================================ // Copyright 2020 LMNT, Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. // ============================================================================== #pragma once #include #include #include #include #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/shape_inference.h" template class TensorView { public: TensorView(T* ptr, const tensorflow::TensorShape& shape) { ptr_ = ptr; leading_dim_ = shape.dim_size(0); stride_ = 1LL; for (int i = 1; i < shape.dims(); ++i) stride_ *= shape.dim_size(i); } tensorflow::int64 AllocatedBytes() const { return leading_dim_ * stride_ * sizeof(T); } T* data() { return ptr_; } T* SubSlicePtr(tensorflow::int64 i) { assert(i >= 0); assert(i < leading_dim_); return ptr_ + (i * stride_); } const tensorflow::int64 NumElements() const { return leading_dim_ * stride_; } private: T* ptr_; tensorflow::int64 leading_dim_; tensorflow::int64 stride_; }; template class Arena { public: struct Entry { std::string name; TensorView view; }; Arena(std::initializer_list map) : map_(map) {} Arena(const std::vector& map) : map_(map) {} TensorView operator[](const std::string& name) { for (auto& unit : map_) if (name == unit.name) return unit.view; assert(false && "Invalid tensor name."); } private: std::vector map_; }; template class ArenaLayout { public: struct Entry { std::string name; tensorflow::TensorShape shape; }; ArenaLayout(std::initializer_list layout) : layout_(layout) {} tensorflow::int64 NumElements() const { tensorflow::int64 total_elements = 0; for (const auto& entry : layout_) total_elements += entry.shape.num_elements(); return total_elements; } Arena Realize(T* ptr) const { std::vector::Entry> map; for (const auto& entry : layout_) { map.push_back({ entry.name, TensorView(ptr, entry.shape) }); ptr += entry.shape.num_elements(); } return Arena(map); } private: std::vector layout_; }; ================================================ 
"""Base RNN layer class."""

import tensorflow as tf
from tensorflow.compat import v1

__all__ = [
    'BaseRNN'
]


def reverse_sequence(sequence, sequence_length):
  """
  Reverses a batched sequence in time-major order [T,N,...].

  The input sequence may be padded, in which case sequence_length specifies
  the unpadded length of each sequence.
  """
  if sequence_length is None:
    return tf.reverse(sequence, axis=[0])
  # Reverse only the valid (unpadded) prefix of each batch element; padding
  # stays in place at the end.
  return tf.reverse_sequence(sequence, sequence_length, seq_axis=0, batch_axis=1)


def transpose(tensor_or_tuple, perm):
  """Transposes the given tensor or tuple of tensors by the same permutation."""
  if isinstance(tensor_or_tuple, tuple):
    return tuple([tf.transpose(tensor, perm) for tensor in tensor_or_tuple])
  return tf.transpose(tensor_or_tuple, perm)


class BaseRNN(tf.Module):
  """Shared machinery for Haste RNN layers: uni/bidirectional dispatch,
  variable scoping, and time-major/batch-major conversion."""

  def __init__(self, rnn_class, num_units, direction, default_name, **kwargs):
    assert direction in ['unidirectional', 'bidirectional']
    self.default_name = default_name
    if direction == 'bidirectional':
      name = kwargs.pop('name', None)
      super().__init__(name)
      self.realname = name
      self.fw_layer = rnn_class(num_units, name='fw', **kwargs)
      self.bw_layer = rnn_class(num_units, name='bw', **kwargs)
    else:
      # NOTE(review): `self.realname` is only assigned on the bidirectional
      # path; `build` only reads it when bidirectional, so this is safe today
      # but fragile if `build` ever changes.
      super().__init__()
      self.fw_layer = rnn_class(num_units, **kwargs)
      self.bw_layer = None

  def build(self, shape):
    """
    Creates the variables of the layer.

    Calling this method is optional for users of the RNN class. It is called
    internally with the correct shape when `__call__` is invoked.

    Arguments:
      shape: instance of `TensorShape`.
    """
    if self.bidirectional:
      # Both directions share one enclosing variable scope so their
      # variables are grouped under this layer's name.
      with self.name_scope, v1.variable_scope(self.realname, self.default_name):
        self.fw_layer.build(shape)
        self.bw_layer.build(shape)
    else:
      self.fw_layer.build(shape)

  @property
  def output_size(self):
    if self.bidirectional:
      return self.fw_layer.output_size, self.bw_layer.output_size
    return self.fw_layer.output_size

  @property
  def state_size(self):
    if self.bidirectional:
      return self.fw_layer.state_size, self.bw_layer.state_size
    return self.fw_layer.state_size

  def __call__(self, inputs, training, sequence_length=None, time_major=False):
    """
    Runs the RNN layer.

    Arguments:
      inputs: Tensor, a rank 3 input tensor with shape [N,T,C] if `time_major`
        is `False`, or with shape [T,N,C] if `time_major` is `True`.
      training: bool, `True` if running in training mode, `False` if running
        in inference mode.
      sequence_length: (optional) Tensor, a rank 1 tensor with shape [N] and
        dtype of `tf.int32` or `tf.int64`. This tensor specifies the unpadded
        length of each example in the input minibatch.
      time_major: (optional) bool, specifies whether `input` has shape [N,T,C]
        (`time_major=False`) or shape [T,N,C] (`time_major=True`).

    Returns:
      A pair, `(output, state)` for unidirectional layers, or a pair
      `([output_fw, output_bw], [state_fw, state_bw])` for bidirectional
      layers.
    """
    # Internally everything runs time-major; convert on the way in/out.
    if not time_major:
      inputs = transpose(inputs, [1, 0, 2])
    result, state = self.fw_layer(inputs, sequence_length, training)
    if self.bidirectional:
      # Run the backward layer on the time-reversed input, then reverse its
      # output back so both directions are aligned in time.
      inputs = reverse_sequence(inputs, sequence_length)
      bw_result, bw_state = self.bw_layer(inputs, sequence_length, training)
      result = result, reverse_sequence(bw_result, sequence_length)
      state = state, bw_state
    if not time_major:
      # `transpose` maps over tuples, so this handles both directions at once.
      result = transpose(result, [1, 0, 2])
    return result, state

  @property
  def bidirectional(self):
    """`True` if this is a bidirectional RNN, `False` otherwise."""
    return self.bw_layer is not None
// ==============================================================================

// NOTE(review): the first header name was stripped during extraction (likely
// <cublas_v2.h>). All angle-bracket template argument lists (`flat<T>`,
// `ForwardPass<T>`, `template<typename T>`, ...) appear stripped as well;
// restore them before compiling.
#include

#include "haste.h"
#include "support.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/util/stream_executor_util.h"
#include "tensorflow/stream_executor/stream.h"

using namespace tensorflow;

using haste::v0::gru::BackwardPass;
using haste::v0::gru::ForwardPass;
using tensorflow::se::Stream;
using tensorflow::shape_inference::DimensionHandle;
using tensorflow::shape_inference::InferenceContext;
using tensorflow::shape_inference::ShapeHandle;

// Define the interface and shape function for the op.
REGISTER_OP("HasteGru")
    .Attr("R: {float, double}")    // Some real number type.
    .Attr("training: bool")
    .Attr("zoneout_prob: float")
    .Input("x: R")                 // [T,N,C]
    .Input("kernel: R")            // [C,H*3]
    .Input("recurrent_kernel: R")  // [H,H*3]
    .Input("bias: R")              // [H*3]
    .Input("recurrent_bias: R")    // [H*3]
    .Input("zoneout_mask: R")      // [T,N,H]
    .Output("h: R")                // [T+1,N,H] — row 0 is the (zeroed) initial
                                   // state; the shape fn below emits T+1.
    .Output("v: R")                // [T,N,H*4] — activation cache consumed by
                                   // the backward pass (empty in inference).
    .SetShapeFn([](InferenceContext* c) {
      ShapeHandle input_shape;
      ShapeHandle kernel_shape;
      ShapeHandle recurrent_shape;
      ShapeHandle bias_shape;
      ShapeHandle recurrent_bias_shape;
      ShapeHandle zoneout_mask_shape;

      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 3, &input_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 2, &kernel_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 2, &recurrent_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 1, &bias_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 1, &recurrent_bias_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 3, &zoneout_mask_shape));

      const DimensionHandle time_steps = c->Dim(input_shape, 0);
      const DimensionHandle batch_size = c->Dim(input_shape, 1);
      const DimensionHandle hidden_size = c->Dim(recurrent_shape, 0);
      DimensionHandle time_steps_plus_1;
      DimensionHandle hidden_size_4;
      TF_RETURN_IF_ERROR(c->Add(time_steps, 1, &time_steps_plus_1));
      TF_RETURN_IF_ERROR(c->Multiply(hidden_size, 4, &hidden_size_4));

      c->set_output(0, c->MakeShape({ time_steps_plus_1, batch_size, hidden_size }));
      c->set_output(1, c->MakeShape({ time_steps, batch_size, hidden_size_4 }));
      return Status::OK();
    });

template struct HasteGruOp : public OpKernel {
  explicit HasteGruOp(OpKernelConstruction* context) : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("training", &training_));
    OP_REQUIRES_OK(context, context->GetAttr("zoneout_prob", &zoneout_prob_));
  }

  // When running on GPU, TF backs all inputs and outputs with device memory
  // and not host memory. We don't need to do explicit memory copies or
  // allocations for the inputs and outputs.
  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& kernel = context->input(1);
    const Tensor& recurrent_kernel = context->input(2);
    const Tensor& bias = context->input(3);
    const Tensor& recurrent_bias = context->input(4);
    const Tensor& zoneout_mask = context->input(5);

    const auto time_steps = input.shape().dim_size(0);
    const auto batch_size = input.shape().dim_size(1);
    const auto input_size = input.shape().dim_size(2);
    const auto hidden_size = recurrent_kernel.shape().dim_size(0);
    // Zoneout only applies if a probability was set AND a mask was supplied
    // (the Python wrapper passes an empty [0,0,0] mask when disabled).
    const bool has_zoneout = zoneout_prob_ && zoneout_mask.NumElements();
    const auto data_type = DataTypeToEnum::value;

    OP_REQUIRES(context, input_size == kernel.shape().dim_size(0),
        errors::InvalidArgument("input[2] and kernel[0] dimensions must match. Found ",
          input_size, " and ", kernel.shape().dim_size(0)));

    // `h` carries T+1 steps (row 0 = initial state). The `v` cache is only
    // materialized in training mode; inference allocates a 0-sized tensor.
    const TensorShape output_shape = { time_steps + 1, batch_size, hidden_size };
    const TensorShape v_out_shape = { time_steps, batch_size, training_ ? hidden_size * 4 : 0 };

    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));

    Tensor* v_out = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(1, v_out_shape, &v_out));

    // Scratch space: Wx holds the input projections for all time steps; Rh
    // holds the recurrent projection for one step.
    Tensor tmp_Wx;
    const TensorShape tmp_Wx_shape = { time_steps, batch_size, hidden_size * 3};
    OP_REQUIRES_OK(context, context->allocate_temp(data_type, tmp_Wx_shape, &tmp_Wx));

    Tensor tmp_Rh;
    const TensorShape tmp_Rh_shape = { batch_size, hidden_size * 3 };
    OP_REQUIRES_OK(context, context->allocate_temp(data_type, tmp_Rh_shape, &tmp_Rh));

    // Zero the whole output so the initial state (row 0) starts at zero.
    cudaMemset(output->flat().data(), 0, output->AllocatedBytes());

    ForwardPass forward(
        training_,
        batch_size,
        input_size,
        hidden_size,
        GetCublasHandle(context));

    forward.Run(
        time_steps,
        kernel.flat().data(),
        recurrent_kernel.flat().data(),
        bias.flat().data(),
        recurrent_bias.flat().data(),
        input.flat().data(),
        output->flat().data(),
        v_out->flat().data(),
        tmp_Wx.flat().data(),
        tmp_Rh.flat().data(),
        has_zoneout ? zoneout_prob_ : 0.0f,
        has_zoneout ? zoneout_mask.flat().data() : nullptr);
  }

  private:
    bool training_;
    float zoneout_prob_;
};

REGISTER_GPU_KERNEL(HasteGru, float);
REGISTER_GPU_KERNEL(HasteGru, double);

REGISTER_OP("HasteGruGrad")
    .Attr("R: {float, double}")
    // NOTE(review): the matrix inputs arrive pre-transposed by the Python
    // gradient function (gru.py transposes x with perm [2,0,1]). The shape
    // function and kernel below read x_t as dim0=C, dim1=T, dim2=N, so the
    // correct annotation is [C,T,N] (the file previously said [T,C,N]).
    .Input("x_t: R")                 // [C,T,N]
    .Input("kernel_t: R")            // [H*3,C]
    .Input("recurrent_kernel_t: R")  // [H*3,H]
    .Input("bias: R")                // [H*3]
    .Input("recurrent_bias: R")      // [H*3]
    .Input("h: R")                   // [T+1,N,H] — full forward output incl. initial state
    .Input("v: R")                   // [T,N,H*4]
    .Input("dh_new: R")              // [T,N,H]
    .Input("zoneout_mask: R")        // [T,N,H]
    .Output("dx: R")                 // [T,N,C]
    .Output("dw: R")                 // [C,H*3]
    .Output("dr: R")                 // [H,H*3]
    .Output("dbx: R")                // [H*3]
    .Output("dbr: R")                // [H*3]
    .SetShapeFn([](InferenceContext* c) {
      ShapeHandle x_shape;
      ShapeHandle kernel_shape;
      ShapeHandle recurrent_kernel_shape;
      ShapeHandle bias_shape;
      ShapeHandle recurrent_bias_shape;
      ShapeHandle h_shape;
      ShapeHandle v_shape;
      ShapeHandle dh_new_shape;
      ShapeHandle zoneout_mask_shape;

      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 3, &x_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 2, &kernel_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 2, &recurrent_kernel_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 1, &bias_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 1, &recurrent_bias_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 3, &h_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 3, &v_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 3, &dh_new_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 3, &zoneout_mask_shape));

      // x_t is [C,T,N]; see the note on the input annotations above.
      DimensionHandle input_size = c->Dim(x_shape, 0);
      DimensionHandle time_steps = c->Dim(x_shape, 1);
      DimensionHandle batch_size = c->Dim(x_shape, 2);
      DimensionHandle hidden_size = c->Dim(recurrent_kernel_shape, 1);

      c->set_output(0, c->MakeShape({ time_steps, batch_size, input_size }));
      c->set_output(1, c->MakeShape({ input_size, c->Value(hidden_size) * 3 }));
      c->set_output(2, c->MakeShape({ hidden_size, c->Value(hidden_size) * 3 }));
      c->set_output(3, bias_shape);
      c->set_output(4, recurrent_bias_shape);
      return Status::OK();
    });

template struct HasteGruGradOp : public OpKernel {
  explicit HasteGruGradOp(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& kernel = context->input(1);
    const Tensor& recurrent_kernel = context->input(2);
    const Tensor& bias = context->input(3);
    const Tensor& recurrent_bias = context->input(4);
    const Tensor& h_vector = context->input(5);
    const Tensor& v_vector = context->input(6);
    const Tensor& dh_new = context->input(7);
    const Tensor& zoneout_mask = context->input(8);

    // Input is the pre-transposed x_t: [C,T,N].
    const auto input_size = input.shape().dim_size(0);
    const auto time_steps = input.shape().dim_size(1);
    const auto batch_size = input.shape().dim_size(2);
    const auto hidden_size = recurrent_kernel.shape().dim_size(1);
    const bool has_zoneout = !!zoneout_mask.NumElements();
    const auto data_type = DataTypeToEnum::value;

    // Can be uninitialized. Output only, no accumulation.
    const TensorShape dx_shape = { time_steps, batch_size, input_size };
    Tensor* dx = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, dx_shape, &dx));

    // Needs to be initialized to 0 (the backward pass accumulates into it).
    const TensorShape dW_shape = { input_size, hidden_size * 3 };
    Tensor* dW = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(1, dW_shape, &dW));

    // Needs to be initialized to 0.
    const TensorShape dR_shape = { hidden_size, hidden_size * 3 };
    Tensor* dR = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(2, dR_shape, &dR));

    // Needs to be initialized to 0.
    const TensorShape dbx_shape = { hidden_size * 3 };
    Tensor* dbx = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(3, dbx_shape, &dbx));

    // Needs to be initialized to 0.
    const TensorShape dbr_shape = { hidden_size * 3 };
    Tensor* dbr = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(4, dbr_shape, &dbr));

    // Needs to be initialized to 0.
    const TensorShape dh_shape = { batch_size, hidden_size };
    Tensor dh;
    OP_REQUIRES_OK(context, context->allocate_temp(data_type, dh_shape, &dh));

    // Can be uninitialized. Output only, no accumulation.
    const TensorShape dp_shape = { time_steps, batch_size, hidden_size * 3 };
    Tensor dp;
    OP_REQUIRES_OK(context, context->allocate_temp(data_type, dp_shape, &dp));

    // Can be uninitialized. Output only, no accumulation.
    const TensorShape dq_shape = { time_steps, batch_size, hidden_size * 3 };
    Tensor dq;
    OP_REQUIRES_OK(context, context->allocate_temp(data_type, dq_shape, &dq));

    cudaMemset(dW->flat().data(), 0, dW->AllocatedBytes());
    cudaMemset(dR->flat().data(), 0, dR->AllocatedBytes());
    cudaMemset(dbx->flat().data(), 0, dbx->AllocatedBytes());
    cudaMemset(dbr->flat().data(), 0, dbr->AllocatedBytes());
    cudaMemset(dh.flat().data(), 0, dh.AllocatedBytes());

    BackwardPass backward(
        batch_size,
        input_size,
        hidden_size,
        GetCublasHandle(context));

    backward.Run(
        time_steps,
        kernel.flat().data(),
        recurrent_kernel.flat().data(),
        bias.flat().data(),
        recurrent_bias.flat().data(),
        input.flat().data(),
        h_vector.flat().data(),
        v_vector.flat().data(),
        dh_new.flat().data(),
        dx->flat().data(),
        dW->flat().data(),
        dR->flat().data(),
        dbx->flat().data(),
        dbr->flat().data(),
        dh.flat().data(),
        dp.flat().data(),
        dq.flat().data(),
        has_zoneout ? zoneout_mask.flat().data() : nullptr);
  }
};

REGISTER_GPU_KERNEL(HasteGruGrad, float);
REGISTER_GPU_KERNEL(HasteGruGrad, double);
"""Gated Recurrent Unit"""

import pkg_resources
import tensorflow as tf
from tensorflow.compat import v1

from .base_rnn import BaseRNN
from .weight_config import WeightConfig

__all__ = [
    'GRU'
]


# The fused custom-op shared library ships inside the package.
LIB = tf.load_op_library(pkg_resources.resource_filename(__name__, 'libhaste_tf.so'))


@tf.RegisterGradient("HasteGru")
def gru_gradient(op, *grads):
  """Gradient function for the HasteGru op.

  Requires the forward op to have run with `training=True`, since the
  backward pass consumes the `v` activation cache that is only produced in
  training mode.
  """
  training = op.get_attr('training')
  if not training:
    raise ValueError(('GRU can only compute gradients if `training=True` was specified during the '
                      'forward pass.\nFailed op: {}').format(op.name))

  # Extract inputs and outputs from the op.
  x = op.inputs[0]
  W = op.inputs[1]
  R = op.inputs[2]
  bx = op.inputs[3]
  br = op.inputs[4]
  zoneout_mask = op.inputs[5]
  h = op.outputs[0]
  v = op.outputs[1]

  # Pre-transpose matrices for better performance.
  # x: [T,N,C] -> [C,T,N]; W: [C,H*3] -> [H*3,C]; R: [H,H*3] -> [H*3,H].
  x = tf.transpose(x, [2, 0, 1])
  W = tf.transpose(W, [1, 0])
  R = tf.transpose(R, [1, 0])

  dx, dW, dR, dbx, dbr = LIB.haste_gru_grad(x, W, R, bx, br, h, v, grads[0], zoneout_mask)
  # No gradient flows to the zoneout mask (the op's last input).
  return [dx, dW, dR, dbx, dbr, None]


class GRULayer(tf.Module):
  """Single-direction GRU layer backed by the fused HasteGru op."""

  def __init__(self,
        num_units,
        kernel_initializer=None,
        recurrent_initializer=None,
        bias_initializer=None,
        recurrent_bias_initializer=None,
        kernel_transform=None,
        recurrent_transform=None,
        bias_transform=None,
        recurrent_bias_transform=None,
        dropout=0.0,
        zoneout=0.0,
        dtype=None,
        name=None):
    super(GRULayer, self).__init__(name)
    self.realname = name
    self.num_units = num_units

    # Each weight has a default (initializer, transform) pair that the
    # caller-provided overrides are layered on top of.
    identity = lambda x: x
    self.kernel_config = WeightConfig(v1.initializers.glorot_uniform(), None, identity)
    self.recurrent_config = WeightConfig(v1.initializers.orthogonal(), None, identity)
    self.bias_config = WeightConfig(v1.initializers.zeros(), None, identity)
    self.recurrent_bias_config = WeightConfig(v1.initializers.zeros(), None, identity)

    self.kernel_config.override(kernel_initializer, None, kernel_transform)
    self.recurrent_config.override(recurrent_initializer, None, recurrent_transform)
    self.bias_config.override(bias_initializer, None, bias_transform)
    self.recurrent_bias_config.override(recurrent_bias_initializer, None, recurrent_bias_transform)

    self.dropout = dropout
    self.zoneout = zoneout
    self.dtype = dtype or tf.float32
    self.built = False

  def build(self, shape):
    if self.built:
      return

    num_units = self.num_units
    input_size = int(shape[-1])

    def build_weights(initializer, shape):
      # Initialize each of the 3 gates independently, then pack them into a
      # single [..., 3*H] tensor along the last axis.
      weights = [initializer(shape, dtype=self.dtype) for _ in range(3)]
      weights = tf.concat(weights, axis=-1)
      return weights

    kernel_weights = build_weights(self.kernel_config.initializer, [input_size, num_units])
    recurrent_weights = build_weights(self.recurrent_config.initializer, [num_units, num_units])
    biases = build_weights(self.bias_config.initializer, [num_units])
    recurrent_biases = build_weights(self.recurrent_bias_config.initializer, [num_units])

    # Store two fused variables — a [C+H, 3*H] kernel and a [6*H] bias —
    # matching the layout that `get_weights` splits back apart.
    weights = tf.concat([kernel_weights, recurrent_weights], axis=0)
    biases = tf.concat([biases, recurrent_biases], axis=0)

    with self.name_scope, v1.variable_scope(self.realname, 'gru_cell'):
      self._kernel = v1.get_variable('kernel', initializer=weights)
      self._bias = v1.get_variable('bias', initializer=biases)
    self.built = True

  def get_weights(self):
    """Splits the fused variables back into the four logical weights and
    applies each weight's configured transform."""
    input_size = self._kernel.shape.as_list()[0] - self.num_units
    kernel, recurrent_kernel = tf.split(self._kernel, [input_size, self.num_units], axis=0)
    bias, recurrent_bias = tf.split(self._bias, 2, axis=0)
    return {
        'kernel': self.kernel_config.transform(kernel),
        'recurrent_kernel': self.recurrent_config.transform(recurrent_kernel),
        'bias': self.bias_config.transform(bias),
        'recurrent_bias': self.recurrent_bias_config.transform(recurrent_bias)
    }

  def __call__(self, inputs, sequence_length, training):
    self.build(inputs.shape)

    shape = tf.shape(inputs)
    time_steps = shape[0]
    batch_size = shape[1]

    # Use an empty zoneout mask if no zoneout is going to be applied.
    # Sadly, we can't pass `None` to the op but at least we won't be wasting
    # memory or bandwidth on this tensor.
    zoneout_mask = tf.zeros([0, 0, 0], dtype=self.dtype)
    if self.zoneout:
      # floor(1 - p + U[0,1)) is 1 with probability 1-p and 0 otherwise,
      # i.e. a Bernoulli(1 - zoneout) "keep" mask.
      zoneout_mask = 1.0 - self.zoneout
      zoneout_mask += tf.random.uniform([time_steps, batch_size, self.num_units], dtype=self.dtype)
      zoneout_mask = tf.floor(zoneout_mask)

    weights = self.get_weights()
    # DropConnect on the recurrent matrix, training mode only.
    if training and self.dropout > 0:
      recurrent_kernel = tf.nn.dropout(weights['recurrent_kernel'], rate=self.dropout)
    else:
      recurrent_kernel = weights['recurrent_kernel']

    result, _ = LIB.haste_gru(
        inputs,
        weights['kernel'],
        recurrent_kernel,
        weights['bias'],
        weights['recurrent_bias'],
        zoneout_mask,
        training=training,
        zoneout_prob=self.zoneout)

    if sequence_length is not None:
      # `result` has T+1 rows and row 0 is the initial state, so row
      # `length` holds the state after the final valid step of each
      # sequence — no off-by-one adjustment is needed.
      indices = sequence_length
      indices = tf.stack([indices, tf.range(batch_size, dtype=sequence_length.dtype)], axis=-1)
      state = tf.gather_nd(result, indices)
    else:
      state = result[-1]

    # Drop the initial-state row so the output aligns with the input.
    return result[1:], state


class GRU(BaseRNN):
  """
  Gated Recurrent Unit layer.

  This GRU layer offers a fused, GPU-accelerated TensorFlow op for inference
  and training. There are two commonly-used variants of GRU cells. This one
  implements 1406.1078v1 which applies the reset gate to the hidden state
  after matrix multiplication. cuDNN also implements this variant. The other
  variant, 1406.1078v3, applies the reset gate before matrix multiplication
  and is currently unsupported.

  This layer has built-in support for DropConnect and Zoneout, which are
  both techniques used to regularize RNNs.
  """

  def __init__(self, num_units, direction='unidirectional', **kwargs):
    """
    Initialize the parameters of the GRU layer.

    Arguments:
      num_units: int, the number of units in the GRU cell.
      direction: string, 'unidirectional' or 'bidirectional'.
      **kwargs: Dict, keyword arguments (see below).

    Keyword Arguments:
      kernel_initializer: (optional) the initializer to use for the input
        matrix weights. Defaults to `glorot_uniform`.
      recurrent_initializer: (optional) the initializer to use for the
        recurrent matrix weights. Defaults to `orthogonal`.
      bias_initializer: (optional) the initializer to use for input bias
        vectors. Defaults to `zeros`.
      recurrent_bias_initializer: (optional) the initializer to use for
        recurrent bias vectors. Defaults to `zeros`.
      kernel_transform: (optional) a function with signature
        `(kernel: Tensor) -> Tensor` that transforms the kernel before it is
        used. Defaults to the identity function.
      recurrent_transform: (optional) a function with signature
        `(recurrent_kernel: Tensor) -> Tensor` that transforms the recurrent
        kernel before it is used. Defaults to the identity function.
      bias_transform: (optional) a function with signature
        `(bias: Tensor) -> Tensor` that transforms the bias before it is
        used. Defaults to the identity function.
      recurrent_bias_transform: (optional) a function with signature
        `(recurrent_bias: Tensor) -> Tensor` that transforms the recurrent
        bias before it is used. Defaults to the identity function.
      dropout: (optional) float, sets the dropout rate for DropConnect
        regularization on the recurrent matrix. Defaults to 0.
      zoneout: (optional) float, sets the zoneout rate for Zoneout
        regularization. Defaults to 0.
      dtype: (optional) the data type for this layer. Defaults to
        `tf.float32`.
      name: (optional) string, the name for this layer.
    """
    super().__init__(GRULayer, num_units, direction, 'gru_cell', **kwargs)
""" def __init__(self, num_units, name=None, **kwargs): super(GRUCell, self).__init__(name=name, **kwargs) self.realname = name self.num_units = num_units self.built = False @property def state_size(self): return self.num_units @property def output_size(self): return self.num_units def build(self, shape): if self.built: return num_units = self.num_units input_size = int(shape[-1]) dtype = self.dtype or tf.float32 kernel_initializer = v1.initializers.glorot_uniform(dtype=dtype) bias_initializer = v1.initializers.zeros(dtype=dtype) with tf.name_scope(self.name), v1.variable_scope(self.realname, 'gru_cell'): self._kernel = v1.get_variable('kernel', initializer=lambda: kernel_initializer([input_size + num_units, num_units * 3])) self._bias = v1.get_variable('bias', initializer=lambda: bias_initializer([num_units * 6])) self.kernel, self.recurrent_kernel = tf.split(self._kernel, [input_size, num_units], axis=0) self.bias, self.recurrent_bias = tf.split(self._bias, 2, axis=0) self.built = True def __call__(self, inputs, state, scope=None): self.build(inputs.shape) h_proj = tf.nn.xw_plus_b(state, self.recurrent_kernel, self.recurrent_bias) x = tf.nn.xw_plus_b(inputs, self.kernel, self.bias) h_z, h_r, h_g = tf.split(h_proj, 3, axis=-1) x_z, x_r, x_g = tf.split(x, 3, axis=-1) z = tf.nn.sigmoid(h_z + x_z) r = tf.nn.sigmoid(h_r + x_r) g = tf.nn.tanh(r * h_g + x_g) h = z * state + (1 - z) * g return h, h ================================================ FILE: frameworks/tf/indrnn.cc ================================================ // Copyright 2020 LMNT, Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ==============================================================================

// TensorFlow bindings for the fused Haste IndRNN forward/backward GPU kernels.
//
// NOTE(review): the `<...>` template argument lists (e.g. `template<typename T>`,
// `.flat<T>()`, `DataTypeToEnum<T>`) appear to have been stripped from this
// extract; the tokens below are preserved exactly as found. Confirm the angle
// brackets against the upstream file before compiling.

#include
#include "haste.h"
#include "support.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/util/stream_executor_util.h"
#include "tensorflow/stream_executor/stream.h"

using namespace tensorflow;

using haste::v0::indrnn::ForwardPass;
using haste::v0::indrnn::BackwardPass;
using tensorflow::se::Stream;
using tensorflow::shape_inference::DimensionHandle;
using tensorflow::shape_inference::InferenceContext;
using tensorflow::shape_inference::ShapeHandle;

// Define the interface and shape function for the op.
REGISTER_OP("HasteIndrnn")
    .Attr("R: {float, double}")   // Some real number type.
    .Attr("training: bool")       // Must be true if gradients will be computed later.
    .Attr("zoneout_prob: float")
    .Input("x: R")                // [T,N,C]
    .Input("kernel: R")           // [C,H]
    .Input("recurrent_scale: R")  // [H]
    .Input("bias: R")             // [H]
    .Input("zoneout_mask: R")     // [T,N,H]
    .Output("h: R")               // [T+1,N,H]; h[0] is the (zero) initial state.
    .SetShapeFn([](InferenceContext* c) {
      ShapeHandle input_shape;
      ShapeHandle kernel_shape;
      ShapeHandle recurrent_shape;
      ShapeHandle bias_shape;
      ShapeHandle zoneout_mask_shape;

      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 3, &input_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 2, &kernel_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &recurrent_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 1, &bias_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 3, &zoneout_mask_shape));

      const DimensionHandle time_steps = c->Dim(input_shape, 0);
      const DimensionHandle batch_size = c->Dim(input_shape, 1);
      const DimensionHandle hidden_size = c->Dim(recurrent_shape, 0);
      DimensionHandle time_steps_plus_1;
      TF_RETURN_IF_ERROR(c->Add(time_steps, 1, &time_steps_plus_1));

      // The output carries the initial state in addition to all T steps.
      c->set_output(0, c->MakeShape({ time_steps_plus_1, batch_size, hidden_size }));
      return Status::OK();
    });

// Forward kernel: allocates h and a per-step workspace, then delegates to the
// fused CUDA implementation.
template
struct HasteIndrnnOp : public OpKernel {
  explicit HasteIndrnnOp(OpKernelConstruction* context) : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("training", &training_));
    OP_REQUIRES_OK(context, context->GetAttr("zoneout_prob", &zoneout_prob_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& kernel = context->input(1);
    const Tensor& recurrent_scale = context->input(2);
    const Tensor& bias = context->input(3);
    const Tensor& zoneout_mask = context->input(4);

    const auto time_steps = input.shape().dim_size(0);
    const auto batch_size = input.shape().dim_size(1);
    const auto input_size = input.shape().dim_size(2);
    const auto hidden_size = recurrent_scale.shape().dim_size(0);
    // Zoneout is active only when both a nonzero rate and a mask are provided.
    const bool has_zoneout = zoneout_prob_ && zoneout_mask.NumElements();
    const auto data_type = DataTypeToEnum::value;

    const TensorShape output_shape = { time_steps + 1, batch_size, hidden_size };
    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));

    Tensor workspace;
    const TensorShape workspace_shape = { time_steps, batch_size, hidden_size };
    OP_REQUIRES_OK(context, context->allocate_temp(data_type, workspace_shape, &workspace));

    // Zero the whole output so h[0] (the initial state) starts at 0.
    cudaMemset(output->flat().data(), 0, output->AllocatedBytes());

    ForwardPass forward(
        training_,
        batch_size,
        input_size,
        hidden_size,
        GetCublasHandle(context));

    forward.Run(
        time_steps,
        kernel.flat().data(),
        recurrent_scale.flat().data(),
        bias.flat().data(),
        input.flat().data(),
        output->flat().data(),
        workspace.flat().data(),
        has_zoneout ? zoneout_prob_ : 0.0f,
        has_zoneout ? zoneout_mask.flat().data() : nullptr);
  }

 private:
  bool training_;
  float zoneout_prob_;
};

REGISTER_GPU_KERNEL(HasteIndrnn, float);
REGISTER_GPU_KERNEL(HasteIndrnn, double);

REGISTER_OP("HasteIndrnnGrad")
    .Attr("R: {float, double}")
    .Input("x_t: R")              // [C,T,N]; was noted "[C,N,T]" — Compute below reads
                                  // C=dim0, T=dim1, N=dim2, and the Python gradient
                                  // transposes x with perm [2,0,1].
    .Input("kernel_t: R")         // [H,C]; was noted "[4,C]", apparently copy-pasted
                                  // from the LSTM op.
    .Input("recurrent_scale: R")  // [H]
    .Input("bias: R")             // [H]
    .Input("zoneout_mask: R")     // [T,N,H]
    .Input("h: R")                // [T+1,N,H]
    .Input("dh_new: R")           // [T+1,N,H]
    .Output("dx: R")              // [T,N,C]
    .Output("dw: R")              // [C,H]
    .Output("dr: R")              // [H]
    .Output("db: R")              // [H]
    .SetShapeFn([](InferenceContext* c) {
      ShapeHandle x_shape;
      ShapeHandle kernel_shape;
      ShapeHandle recurrent_shape;
      ShapeHandle bias_shape;
      ShapeHandle zoneout_mask_shape;
      ShapeHandle h_shape;
      ShapeHandle dh_new_shape;

      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 3, &x_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 2, &kernel_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &recurrent_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 1, &bias_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 3, &zoneout_mask_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 3, &h_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 3, &dh_new_shape));

      // x arrives transposed, so its dims are [C,T,N].
      DimensionHandle input_size = c->Dim(x_shape, 0);
      DimensionHandle time_steps = c->Dim(x_shape, 1);
      DimensionHandle batch_size = c->Dim(x_shape, 2);
      DimensionHandle hidden_size = c->Dim(recurrent_shape, 0);

      c->set_output(0, c->MakeShape({ time_steps, batch_size, input_size }));
      c->set_output(1, c->MakeShape({ input_size, hidden_size }));
      c->set_output(2, recurrent_shape);
      c->set_output(3, bias_shape);
      return Status::OK();
    });

// Backward kernel: zero-fills the accumulated gradient buffers and delegates
// to the fused CUDA implementation.
template
struct HasteIndrnnGradOp : public OpKernel {
  explicit HasteIndrnnGradOp(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& kernel = context->input(1);
    const Tensor& recurrent_scale = context->input(2);
    const Tensor& bias = context->input(3);
    const Tensor& zoneout_mask = context->input(4);
    const Tensor& h_vector = context->input(5);
    const Tensor& dh_new = context->input(6);

    // `input` is the transposed x: [C,T,N].
    const auto input_size = input.shape().dim_size(0);
    const auto time_steps = input.shape().dim_size(1);
    const auto batch_size = input.shape().dim_size(2);
    const auto hidden_size = recurrent_scale.shape().dim_size(0);
    const bool has_zoneout = !!zoneout_mask.NumElements();
    const auto data_type = DataTypeToEnum::value;

    // Can be uninitialized. Output only, no accumulation.
    const TensorShape dx_shape = { time_steps, batch_size, input_size };
    Tensor* dx = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, dx_shape, &dx));

    // Needs to be initialized to 0.
    const TensorShape dW_shape = { input_size, hidden_size };
    Tensor* dW = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(1, dW_shape, &dW));

    // Needs to be initialized to 0.
    const TensorShape du_shape = { hidden_size };
    Tensor* du = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(2, du_shape, &du));

    // Needs to be initialized to 0.
    const TensorShape db_shape = { hidden_size };
    Tensor* db = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(3, db_shape, &db));

    // Needs to be initialized to 0.
    const TensorShape dh_shape = { batch_size, hidden_size };
    Tensor dh;
    OP_REQUIRES_OK(context, context->allocate_temp(data_type, dh_shape, &dh));

    const TensorShape workspace_shape = { time_steps, batch_size, hidden_size };
    Tensor workspace;
    OP_REQUIRES_OK(context, context->allocate_temp(data_type, workspace_shape, &workspace));

    // The backward pass accumulates into these buffers, so start them at 0.
    cudaMemset(dW->flat().data(), 0, dW->AllocatedBytes());
    cudaMemset(du->flat().data(), 0, du->AllocatedBytes());
    cudaMemset(db->flat().data(), 0, db->AllocatedBytes());
    cudaMemset(dh.flat().data(), 0, dh.AllocatedBytes());

    BackwardPass backward = BackwardPass(
        batch_size,
        input_size,
        hidden_size,
        GetCublasHandle(context));

    backward.Run(
        time_steps,
        kernel.flat().data(),
        recurrent_scale.flat().data(),
        bias.flat().data(),
        input.flat().data(),
        h_vector.flat().data(),
        dh_new.flat().data(),
        dx->flat().data(),
        dW->flat().data(),
        du->flat().data(),
        db->flat().data(),
        dh.flat().data(),
        workspace.flat().data(),
        has_zoneout ? zoneout_mask.flat().data() : nullptr);
  }
};

REGISTER_GPU_KERNEL(HasteIndrnnGrad, float);
REGISTER_GPU_KERNEL(HasteIndrnnGrad, double);

================================================ FILE: frameworks/tf/indrnn.py ================================================
# Copyright 2020 LMNT, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================== """Independently Recurrent Neural Network""" import pkg_resources import tensorflow as tf from tensorflow.compat import v1 from tensorflow.compat.v1.nn import rnn_cell from .base_rnn import BaseRNN from .weight_config import WeightConfig __all__ = [ 'IndRNN' ] LIB = tf.load_op_library(pkg_resources.resource_filename(__name__, 'libhaste_tf.so')) @tf.RegisterGradient("HasteIndrnn") def indrnn_gradient(op, *grads): training = op.get_attr('training') if not training: raise ValueError(('IndRNN can only compute gradients if `training=True` was specified during ' 'the forward pass.\nFailed op: {}').format(op.name)) # Extract inputs and outputs from the op. x = op.inputs[0] W = op.inputs[1] u = op.inputs[2] b = op.inputs[3] zoneout_mask = op.inputs[4] h = op.outputs[0] # Pre-transpose matrices for better performance. x = tf.transpose(x, [2, 0, 1]) W = tf.transpose(W, [1, 0]) dx, dW, du, db = LIB.haste_indrnn_grad(x, W, u, b, zoneout_mask, h, grads[0]) return [dx, dW, du, db, None] def _get_initializer(initializer): if not isinstance(initializer, dict): return initializer if 'uniform' in initializer: value = initializer['uniform'] return v1.initializers.random_uniform(-value, value) if 'normal' in initializer: value = initializer['normal'] return v1.initializers.truncated_normal(stddev=value) raise ValueError(f'Unknown initializer {initializer}') class IndRNNLayer(tf.Module): def __init__(self, num_units, kernel_initializer=None, recurrent_initializer=None, bias_initializer=None, kernel_transform=None, recurrent_transform=None, bias_transform=None, zoneout=0.0, dtype=None, name=None): super().__init__(name) self.realname = name self.num_units = num_units identity = lambda x: x self.kernel_config = WeightConfig(v1.initializers.glorot_uniform(), None, identity) self.recurrent_config = WeightConfig(v1.initializers.random_uniform(-0.5, 0.5), None, identity) self.bias_config = 
WeightConfig(v1.initializers.zeros(), None, identity) self.kernel_config.override(_get_initializer(kernel_initializer), None, kernel_transform) self.recurrent_config.override(_get_initializer(recurrent_initializer), None, recurrent_transform) self.bias_config.override(_get_initializer(bias_initializer), None, bias_transform) self.zoneout = zoneout self.dtype = dtype or tf.float32 self.kernel = None self.recurrent_scale = None self.bias = None self.recurrent_bias = None self.built = False def build(self, shape): if self.built: return num_units = self.num_units input_size = int(shape[-1]) kernel_shape = tf.TensorShape([input_size, num_units]) recurrent_shape = tf.TensorShape([num_units]) bias_shape = tf.TensorShape([num_units]) kernel_weights = self.kernel_config.initializer(kernel_shape, dtype=self.dtype) recurrent_weights = self.recurrent_config.initializer(recurrent_shape, dtype=self.dtype) biases = self.bias_config.initializer(bias_shape) with self.name_scope, v1.variable_scope(self.realname, 'indrnn_cell'): self.kernel = v1.get_variable('kernel', initializer=kernel_weights) self.recurrent_scale = v1.get_variable('recurrent_scale', initializer=recurrent_weights) self.bias = v1.get_variable('bias', initializer=biases) self.built = True def get_weights(self): return { 'kernel': self.kernel_config.transform(self.kernel), 'recurrent_scale': self.recurrent_config.transform(self.recurrent_scale), 'bias': self.bias_config.transform(self.bias) } def __call__(self, inputs, sequence_length, training): self.build(inputs.shape) shape = tf.shape(inputs) time_steps = shape[0] batch_size = shape[1] # Use an empty zoneout mask if no zoneout is going to be applied. # Sadly, we can't pass `None` to the op but at least we won't be wasting # memory or bandwidth on this tensor. 
zoneout_mask = tf.zeros([0, 0, 0], dtype=self.dtype) if self.zoneout: zoneout_mask = 1.0 - self.zoneout zoneout_mask += tf.random.uniform([time_steps, batch_size, self.num_units], dtype=self.dtype) zoneout_mask = tf.floor(zoneout_mask) weights = self.get_weights() result = LIB.haste_indrnn( inputs, weights['kernel'], weights['recurrent_scale'], weights['bias'], zoneout_mask, training=training, zoneout_prob=self.zoneout) if sequence_length is not None: # 0-indexed tensors, so length-1. indices = sequence_length indices = tf.stack([indices, tf.range(batch_size, dtype=sequence_length.dtype)], axis=-1) state = tf.gather_nd(result, indices) else: state = result[-1] return result[1:], state class IndRNN(BaseRNN): """ Independently Recurrent Neural Network layer. This layer offers a fused, GPU-accelerated TensorFlow op for inference and training. It also supports Zoneout regularization. """ def __init__(self, num_units, direction='unidirectional', **kwargs): """ Initialize the parameters of the IndRNN layer. Arguments: num_units: int, the number of units in the IndRNN cell. direction: string, 'unidirectional' or 'bidirectional'. **kwargs: Dict, keyword arguments (see below). Keyword Arguments: kernel_initializer: (optional) the initializer to use for the input matrix weights. Defaults to `glorot_uniform`. recurrent_initializer: (optional) the initializer to use for the recurrent scale weights. Defaults to uniform random in [-0.5, 0.5]. Note that this initialization scheme is different than in the original authors' implementation. See https://github.com/lmnt-com/haste/issues/7 for details. bias_initializer: (optional) the initializer to use for the bias vector. Defaults to `zeros`. kernel_transform: (optional) a function with signature `(kernel: Tensor) -> Tensor` that transforms the kernel before it is used. Defaults to the identity function. 
recurrent_transform: (optional) a function with signature `(recurrent_scale: Tensor) -> Tensor` that transforms the recurrent scale vector before it is used. Defaults to the identity function. bias_transform: (optional) a function with signature `(bias: Tensor) -> Tensor` that transforms the bias before it is used. Defaults to the identity function. zoneout: (optional) float, sets the zoneout rate for Zoneout regularization. Defaults to 0. dtype: (optional) the data type for this layer. Defaults to `tf.float32`. name: (optional) string, the name for this layer. """ super().__init__(IndRNNLayer, num_units, direction, 'indrnn_cell', **kwargs) ================================================ FILE: frameworks/tf/layer_norm.cc ================================================ // Copyright 2020 LMNT, Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
// ==============================================================================

// TensorFlow bindings for the fused Haste layer-normalization kernels.
//
// NOTE(review): `<...>` template argument lists (e.g. `template<typename T>`,
// `.flat<T>()`) appear stripped from this extract; tokens preserved as found.

#include "haste.h"
#include "support.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/util/stream_executor_util.h"
#include "tensorflow/stream_executor/stream.h"

using namespace tensorflow;

using haste::v0::layer_norm::ForwardPass;
using haste::v0::layer_norm::BackwardPass;
using tensorflow::se::Stream;
using tensorflow::shape_inference::DimensionHandle;
using tensorflow::shape_inference::InferenceContext;
using tensorflow::shape_inference::ShapeHandle;

REGISTER_OP("HasteLayerNorm")
    .Attr("R: {float, double}")
    .Input("x: R")      // [N,H]; rows are normalized independently.
    .Input("gamma: R")  // [H] scale.
    .Input("beta: R")   // [H] shift; an empty tensor disables the shift term.
    .Output("y: R")     // same shape as x.
    .Output("cache: R") // per-row statistics saved for the backward pass.
    .SetShapeFn([](InferenceContext* c) {
      ShapeHandle input_shape;
      ShapeHandle gamma_shape;
      ShapeHandle beta_shape;
      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &input_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &gamma_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &beta_shape));
      c->set_output(0, input_shape);
      c->set_output(1, c->UnknownShapeOfRank(2));
      return Status::OK();
    });

template
struct HasteLayerNormOp : public OpKernel {
  explicit HasteLayerNormOp(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& x = context->input(0);
    const Tensor& gamma = context->input(1);
    const Tensor& beta = context->input(2);

    const auto batch_size = x.shape().dim_size(0);
    const auto hidden_size = x.shape().dim_size(1);

    Tensor* y = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, x.shape(), &y));

    // Two cached values per row (presumably mean / inverse stddev — confirm
    // against lib/haste/layer_norm.h).
    Tensor* cache = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(1, { batch_size, 2 }, &cache));

    // An empty `beta` disables the shift term.
    ForwardPass forward(
        batch_size,
        hidden_size,
        gamma.flat().data(),
        beta.shape().dim_size(0) ? beta.flat().data() : nullptr,
        cache->flat().data());
    forward.Run(GetCudaStream(context), x.flat().data(), y->flat().data());
  }
};

REGISTER_GPU_KERNEL(HasteLayerNorm, float);
REGISTER_GPU_KERNEL(HasteLayerNorm, double);

REGISTER_OP("HasteLayerNormGrad")
    .Attr("R: {float, double}")
    .Input("x: R")
    .Input("gamma: R")
    .Input("beta: R")
    .Input("dy: R")
    .Input("cache: R")  // statistics produced by the forward op.
    .Output("dx: R")
    .Output("dgamma: R")
    .Output("dbeta: R")
    .SetShapeFn([](InferenceContext* c) {
      ShapeHandle x_shape;
      ShapeHandle gamma_shape;
      ShapeHandle beta_shape;
      ShapeHandle dy_shape;
      ShapeHandle cache_shape;
      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &x_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &gamma_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &beta_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 2, &dy_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 2, &cache_shape));
      c->set_output(0, x_shape);
      c->set_output(1, gamma_shape);
      c->set_output(2, beta_shape);
      return Status::OK();
    });

template
struct HasteLayerNormGradOp : public OpKernel {
  explicit HasteLayerNormGradOp(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& x = context->input(0);
    const Tensor& gamma = context->input(1);
    const Tensor& beta = context->input(2);
    const Tensor& dy = context->input(3);
    const Tensor& cache = context->input(4);

    const auto batch_size = x.shape().dim_size(0);
    const auto hidden_size = x.shape().dim_size(1);
    const auto cache_shape = context->input(4).shape();  // NOTE(review): unused.
    const auto data_type = DataTypeToEnum::value;        // NOTE(review): unused.

    Tensor* dx = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, x.shape(), &dx));

    Tensor* dgamma = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(1, gamma.shape(), &dgamma));

    Tensor* dbeta = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(2, beta.shape(), &dbeta));

    // dgamma/dbeta are accumulated by the backward pass, so start them at 0.
    cudaMemset(dgamma->flat().data(), 0, dgamma->AllocatedBytes());
    cudaMemset(dbeta->flat().data(), 0,
        dbeta->AllocatedBytes());

    // An empty `beta` disables the shift term and its gradient.
    BackwardPass backward(
        batch_size,
        hidden_size,
        gamma.flat().data(),
        beta.shape().dim_size(0) ? beta.flat().data() : nullptr,
        x.flat().data(),
        dgamma->flat().data(),
        beta.shape().dim_size(0) ? dbeta->flat().data() : nullptr,
        const_cast(cache.flat().data()));
    backward.Run(GetCudaStream(context), dy.flat().data(), dx->flat().data());
  }
};

REGISTER_GPU_KERNEL(HasteLayerNormGrad, float);
REGISTER_GPU_KERNEL(HasteLayerNormGrad, double);

================================================ FILE: frameworks/tf/layer_norm.py ================================================
# Copyright 2020 LMNT, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Layer Normalization"""

import pkg_resources
import tensorflow as tf

from tensorflow.compat import v1


__all__ = [
    'LayerNorm'
]


LIB = tf.load_op_library(pkg_resources.resource_filename(__name__, 'libhaste_tf.so'))


@tf.RegisterGradient("HasteLayerNorm")
def layer_norm_gradient(op, *grads):
  # Forward the per-row statistics saved by the forward op (output 1) to the
  # fused backward op; only the gradient w.r.t. `y` (grads[0]) is consumed.
  x = op.inputs[0]
  gamma = op.inputs[1]
  beta = op.inputs[2]
  cache = op.outputs[1]
  return LIB.haste_layer_norm_grad(x, gamma, beta, grads[0], cache)


class LayerNorm(tf.Module):
  """
  Layer normalization layer.

  This class exposes a fused and GPU-accelerated implementation of layer
  normalization as described by [Ba et al.](https://arxiv.org/abs/1607.06450)
  """

  def __init__(self, name=None):
    """
    Initialize the parameters of the layer normalization layer.

    Arguments:
      name: (optional) string, the name for this layer.
    """
    super(LayerNorm, self).__init__(name)
    # Keep the raw caller-supplied name for `variable_scope` below.
    self.realname = name
    self.gamma = None  # scale parameter, created lazily in `build`
    self.beta = None   # shift parameter, created lazily in `build`
    self.built = False

  def build(self, shape):
    """
    Creates the variables of the layer. Calling this method is optional for
    users of the LayerNorm class. It is called internally with the correct
    shape when `__call__` is invoked.

    Arguments:
      shape: instance of `TensorShape`.
    """
    if self.built:
      return
    hidden_size = int(shape[-1])
    with self.name_scope, v1.variable_scope(self.realname, 'layer_norm'):
      self.gamma = v1.get_variable('gamma', shape=[hidden_size], initializer=v1.initializers.ones())
      self.beta = v1.get_variable('beta', shape=[hidden_size], initializer=v1.initializers.zeros())
    self.built = True

  def __call__(self, x):
    """
    Runs the layer.

    Arguments:
      x: Tensor, a rank R tensor.

    Returns:
      y: Tensor, a rank R tensor with the last dimension normalized.
    """
    self.build(x.shape)
    # The second op output is the backward-pass cache; discard it here.
    y, _ = LIB.haste_layer_norm(x, self.gamma, self.beta)
    return y

================================================ FILE: frameworks/tf/layer_norm_gru.cc ================================================
# Copyright 2020 LMNT, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
// ==============================================================================

// TensorFlow bindings for the fused, layer-normalized Haste GRU kernels.
//
// NOTE(review): `<...>` template argument lists (e.g. `template<typename T>`,
// `.flat<T>()`, `ArenaLayout<T>`, `TensorView<T>`) appear stripped from this
// extract; the tokens below are preserved as found.

#include
#include "arena.h"
#include "haste.h"
#include "support.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/util/stream_executor_util.h"
#include "tensorflow/stream_executor/stream.h"

using namespace tensorflow;

using tensorflow::se::Stream;
using tensorflow::shape_inference::DimensionHandle;
using tensorflow::shape_inference::InferenceContext;
using tensorflow::shape_inference::ShapeHandle;

namespace layer_norm = haste::v0::layer_norm;
namespace layer_norm_gru = haste::v0::layer_norm_gru;

// Define the interface and shape function for the op.
REGISTER_OP("HasteLayerNormGru")
    .Attr("R: {float, double}")    // Some real number type.
    .Attr("training: bool")
    .Attr("zoneout_prob: float")
    .Input("x: R")                 // [T,N,C]
    .Input("kernel: R")            // [C,H*3]
    .Input("recurrent_kernel: R")  // [H,H*3]
    .Input("bias: R")              // [H*3]
    .Input("recurrent_bias: R")    // [H*3]
    .Input("gamma: R")             // [2,H*3]; row 0 for Wx, row 1 for Rh.
    .Input("zoneout_mask: R")      // [T,N,H]
    .Output("h: R")                // [T+1,N,H]; was noted "[T,N,H]" but both the
                                   // shape fn and the kernel emit T+1 steps.
    .Output("v: R")                // flat rank-1 arena of all backward-pass
                                   // intermediates; the old "[T,N,H*4]" note
                                   // described only its "cache" slice.
    .SetShapeFn([](InferenceContext* c) {
      ShapeHandle input_shape;
      ShapeHandle kernel_shape;
      ShapeHandle recurrent_shape;
      ShapeHandle bias_shape;
      ShapeHandle recurrent_bias_shape;
      ShapeHandle gamma_shape;
      ShapeHandle zoneout_mask_shape;

      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 3, &input_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 2, &kernel_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 2, &recurrent_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 1, &bias_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 1, &recurrent_bias_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 2, &gamma_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 3, &zoneout_mask_shape));

      const DimensionHandle time_steps = c->Dim(input_shape, 0);
      const DimensionHandle batch_size = c->Dim(input_shape, 1);
      const DimensionHandle hidden_size = c->Dim(recurrent_shape, 0);
      DimensionHandle time_steps_plus_1;
      TF_RETURN_IF_ERROR(c->Add(time_steps, 1, &time_steps_plus_1));

      c->set_output(0, c->MakeShape({ time_steps_plus_1, batch_size, hidden_size }));
      c->set_output(1, c->UnknownShapeOfRank(1));
      return Status::OK();
    });

template
struct HasteLayerNormGruOp : public OpKernel {
  explicit HasteLayerNormGruOp(OpKernelConstruction* context) : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("training", &training_));
    OP_REQUIRES_OK(context, context->GetAttr("zoneout_prob", &zoneout_prob_));
  }

  // When running on GPU, TF backs all inputs and outputs with device memory
  // and not host memory. We don't need to do explicit memory copies or
  // allocations for the inputs and outputs.
  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& kernel = context->input(1);
    const Tensor& recurrent_kernel = context->input(2);
    const Tensor& bias = context->input(3);
    const Tensor& recurrent_bias = context->input(4);
    const Tensor& gamma = context->input(5);
    const Tensor& zoneout_mask = context->input(6);

    const auto time_steps = input.shape().dim_size(0);
    const auto batch_size = input.shape().dim_size(1);
    const auto input_size = input.shape().dim_size(2);
    const auto hidden_size = recurrent_kernel.shape().dim_size(0);
    const bool has_zoneout = zoneout_prob_ && zoneout_mask.NumElements();
    const auto data_type = DataTypeToEnum::value;

    OP_REQUIRES(context, input_size == kernel.shape().dim_size(0),
        errors::InvalidArgument("input[2] and kernel[0] dimensions must match. Found ",
            input_size, " and ", kernel.shape().dim_size(0)));

    const TensorShape output_shape = { time_steps + 1, batch_size, hidden_size };
    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));

    // All backward-pass intermediates live in one flat arena (output 1) so a
    // single allocation can be threaded through to the gradient op. The grad
    // op declares the identical layout; keep the two in sync.
    const ArenaLayout memory_layout = {
      { "cache",             { time_steps, batch_size, hidden_size * 4 } },
      { "act_Wx",            { time_steps, batch_size, hidden_size * 3 } },
      { "act_Wx_norm_cache", { time_steps, batch_size, 2 } },
      { "act_Rh",            { time_steps, batch_size, hidden_size * 3 } },
      { "act_Rh_norm_cache", { time_steps, batch_size, 2 } },
    };

    Tensor* output_cache = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(1, { memory_layout.NumElements() }, &output_cache));

    Arena memory = memory_layout.Realize(output_cache->flat().data());
    TensorView cache = memory["cache"];
    TensorView act_Wx = memory["act_Wx"];
    TensorView act_Wx_norm_cache = memory["act_Wx_norm_cache"];
    TensorView act_Rh = memory["act_Rh"];
    TensorView act_Rh_norm_cache = memory["act_Rh_norm_cache"];

    const TensorShape tmp_Wx_norm_shape = { time_steps, batch_size, hidden_size * 3 };
    Tensor tmp_Wx_norm;
    OP_REQUIRES_OK(context, context->allocate_temp(data_type, tmp_Wx_norm_shape, &tmp_Wx_norm));

    const TensorShape tmp_Rh_norm_shape = { batch_size, hidden_size * 3 };
    Tensor tmp_Rh_norm;
    OP_REQUIRES_OK(context, context->allocate_temp(data_type, tmp_Rh_norm_shape, &tmp_Rh_norm));

    // Zero the whole output so h[0] (the initial state) starts at 0.
    cudaMemset(output->flat().data(), 0, output->AllocatedBytes());

    // gamma row 0 normalizes the Wx projections, row 1 the Rh projections.
    layer_norm::ForwardPass layer_norm1(
        time_steps * batch_size,
        hidden_size * 3,
        gamma.SubSlice(0).unaligned_flat().data(),
        nullptr,
        act_Wx_norm_cache.data());
    layer_norm::ForwardPass layer_norm2(
        time_steps * batch_size,
        hidden_size * 3,
        gamma.SubSlice(1).unaligned_flat().data(),
        nullptr,
        act_Rh_norm_cache.data());

    layer_norm_gru::ForwardPass gru(
        training_,
        batch_size,
        input_size,
        hidden_size,
        GetCublasHandle(context));

    gru.Run(
        time_steps,
        kernel.flat().data(),
        recurrent_kernel.flat().data(),
        bias.flat().data(),
        recurrent_bias.flat().data(),
        input.flat().data(),
        output->flat().data(),
        cache.data(),
        act_Wx.data(),
        layer_norm1,
        tmp_Wx_norm.flat().data(),
        act_Rh.data(),
        layer_norm2,
        tmp_Rh_norm.flat().data(),
        has_zoneout ? zoneout_prob_ : 0.0f,
        has_zoneout ? zoneout_mask.flat().data() : nullptr);
  }

 private:
  bool training_;
  float zoneout_prob_;
};

REGISTER_GPU_KERNEL(HasteLayerNormGru, float);
REGISTER_GPU_KERNEL(HasteLayerNormGru, double);

REGISTER_OP("HasteLayerNormGruGrad")
    .Attr("R: {float, double}")
    .Input("x_t: R")                 // [C,T,N]; was noted "[T,C,N]" — Compute below
                                     // reads C=dim0, T=dim1, N=dim2.
    .Input("kernel_t: R")            // [H*3,C]
    .Input("recurrent_kernel_t: R")  // [H*3,H]
    .Input("bias: R")                // [H*3]
    .Input("recurrent_bias: R")      // [H*3]
    .Input("gamma: R")               // [2,H*3]
    .Input("h: R")                   // [T,N,H] — NOTE(review): the forward op emits
                                     // [T+1,N,H]; confirm which slice the Python
                                     // gradient passes here.
    .Input("cache: R")               // [?] flat arena produced by the forward op.
    .Input("dh_new: R")              // [T,N,H]
    .Input("zoneout_mask: R")        // [T,N,H]
    .Output("dx: R")                 // [T,N,C]
    .Output("dw: R")                 // [C,H*3]
    .Output("dr: R")                 // [H,H*3]
    .Output("dbx: R")                // [H*3]
    .Output("dbr: R")                // [H*3]
    .Output("dgamma: R")
    .SetShapeFn([](InferenceContext* c) {
      ShapeHandle x_shape;
      ShapeHandle kernel_shape;
      ShapeHandle recurrent_kernel_shape;
      ShapeHandle bias_shape;
      ShapeHandle recurrent_bias_shape;
      ShapeHandle gamma_shape;
      ShapeHandle h_shape;
      ShapeHandle cache_shape;
      ShapeHandle dh_new_shape;
      ShapeHandle zoneout_mask_shape;

      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 3, &x_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 2, &kernel_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 2, &recurrent_kernel_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 1, &bias_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 1, &recurrent_bias_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 2, &gamma_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 3, &h_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 1, &cache_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 3, &dh_new_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(9), 3, &zoneout_mask_shape));

      // x arrives transposed, so its dims are [C,T,N].
      DimensionHandle input_size = c->Dim(x_shape, 0);
      DimensionHandle time_steps = c->Dim(x_shape, 1);
      DimensionHandle batch_size = c->Dim(x_shape, 2);
      DimensionHandle hidden_size = c->Dim(recurrent_kernel_shape, 1);

      c->set_output(0, c->MakeShape({ time_steps, batch_size, input_size }));
      c->set_output(1, c->MakeShape({ input_size, c->Value(hidden_size) * 3 }));
      c->set_output(2, c->MakeShape({ hidden_size, c->Value(hidden_size) * 3 }));
      c->set_output(3, bias_shape);
      c->set_output(4, recurrent_bias_shape);
      c->set_output(5, gamma_shape);
      return Status::OK();
    });

template
struct HasteLayerNormGruGradOp : public OpKernel {
  explicit HasteLayerNormGruGradOp(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& kernel = context->input(1);
    const Tensor& recurrent_kernel = context->input(2);
    const Tensor& bias = context->input(3);
    const Tensor& recurrent_bias = context->input(4);
    const Tensor& gamma = context->input(5);
    const Tensor& h_vector = context->input(6);
    const Tensor& cache_input = context->input(7);
    const Tensor& dh_new = context->input(8);
    const Tensor& zoneout_mask = context->input(9);

    // `input` is the transposed x: [C,T,N].
    const auto input_size = input.shape().dim_size(0);
    const auto time_steps = input.shape().dim_size(1);
    const auto batch_size = input.shape().dim_size(2);
    const auto hidden_size = recurrent_kernel.shape().dim_size(1);
    const bool has_zoneout = !!zoneout_mask.NumElements();
    const auto data_type = DataTypeToEnum::value;

    // Must match the layout declared by the forward op so the views line up.
    const ArenaLayout memory_layout = {
      { "cache",             { time_steps, batch_size, hidden_size * 4 } },
      { "act_Wx",            { time_steps, batch_size, hidden_size * 3 } },
      { "act_Wx_norm_cache", { time_steps, batch_size, 2 } },
      { "act_Rh",            { time_steps, batch_size, hidden_size * 3 } },
      { "act_Rh_norm_cache", { time_steps, batch_size, 2 } },
    };

    assert(cache_input.shape().num_elements() == memory_layout.NumElements());

    Arena memory = memory_layout.Realize(const_cast(cache_input.flat().data()));
    TensorView cache = memory["cache"];
    TensorView act_Wx = memory["act_Wx"];
    TensorView act_Wx_norm_cache = memory["act_Wx_norm_cache"];
    TensorView act_Rh = memory["act_Rh"];
    TensorView act_Rh_norm_cache = memory["act_Rh_norm_cache"];

    // Can be uninitialized. Output only, no accumulation.
    const TensorShape dx_shape = { time_steps, batch_size, input_size };
    Tensor* dx = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, dx_shape, &dx));

    // Needs to be initialized to 0.
    const TensorShape dW_shape = { input_size, hidden_size * 3 };
    Tensor* dW = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(1, dW_shape, &dW));

    // Needs to be initialized to 0.
    const TensorShape dR_shape = { hidden_size, hidden_size * 3 };
    Tensor* dR = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(2, dR_shape, &dR));

    // Needs to be initialized to 0.
    const TensorShape dbx_shape = { hidden_size * 3 };
    Tensor* dbx = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(3, dbx_shape, &dbx));

    // Needs to be initialized to 0.
    const TensorShape dbr_shape = { hidden_size * 3 };
    Tensor* dbr = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(4, dbr_shape, &dbr));

    // Needs to be initialized to 0.
    Tensor* dgamma = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(5, gamma.shape(), &dgamma));

    // Needs to be initialized to 0.
    const TensorShape dh_shape = { batch_size, hidden_size };
    Tensor dh;
    OP_REQUIRES_OK(context, context->allocate_temp(data_type, dh_shape, &dh));

    // Can be uninitialized. Output only, no accumulation.
    const TensorShape dp_shape = { time_steps, batch_size, hidden_size * 3 };
    Tensor dp;
    OP_REQUIRES_OK(context, context->allocate_temp(data_type, dp_shape, &dp));

    // Can be uninitialized. Output only, no accumulation.
    const TensorShape dq_shape = { time_steps, batch_size, hidden_size * 3 };
    Tensor dq;
    OP_REQUIRES_OK(context, context->allocate_temp(data_type, dq_shape, &dq));

    cudaMemset(dW->flat().data(), 0, dW->AllocatedBytes());
    cudaMemset(dR->flat().data(), 0, dR->AllocatedBytes());
    cudaMemset(dbx->flat().data(), 0, dbx->AllocatedBytes());
    cudaMemset(dbr->flat().data(), 0, dbr->AllocatedBytes());
    cudaMemset(dh.flat().data(), 0, dh.AllocatedBytes());
    cudaMemset(dgamma->flat().data(), 0, dgamma->AllocatedBytes());

    // gamma row 0 normalizes Wx, row 1 normalizes Rh (mirrors the forward op).
    layer_norm::BackwardPass layer_norm1(
        time_steps * batch_size,
        hidden_size * 3,
        gamma.SubSlice(0).unaligned_flat().data(),
        nullptr,
        act_Wx.data(),
        dgamma->SubSlice(0).unaligned_flat().data(),
        nullptr,
        act_Wx_norm_cache.data());
    layer_norm::BackwardPass layer_norm2(
        time_steps * batch_size,
        hidden_size * 3,
        gamma.SubSlice(1).unaligned_flat().data(),
        nullptr,
        act_Rh.data(),
        dgamma->SubSlice(1).unaligned_flat().data(),
        nullptr,
        act_Rh_norm_cache.data());

    layer_norm_gru::BackwardPass gru(
        batch_size,
        input_size,
        hidden_size,
        GetCublasHandle(context));

    gru.Run(
        time_steps,
        kernel.flat().data(),
        recurrent_kernel.flat().data(),
        bias.flat().data(),
        recurrent_bias.flat().data(),
        input.flat().data(),
        h_vector.flat().data(),
        cache.data(),
        dh_new.flat().data(),
        dx->flat().data(),
        dW->flat().data(),
        dR->flat().data(),
        dbx->flat().data(),
        dbr->flat().data(),
        dh.flat().data(),
        dp.flat().data(),
        dq.flat().data(),
        layer_norm1,
        layer_norm2,
        has_zoneout ? zoneout_mask.flat().data() : nullptr);
  }
};

REGISTER_GPU_KERNEL(HasteLayerNormGruGrad, float);
REGISTER_GPU_KERNEL(HasteLayerNormGruGrad, double);

================================================ FILE: frameworks/tf/layer_norm_gru.py ================================================
# Copyright 2020 LMNT, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Layer Normalized Gated Recurrent Unit"""

import pkg_resources
import tensorflow as tf

from tensorflow.compat import v1

from .base_rnn import BaseRNN
from .weight_config import WeightConfig


__all__ = [
    'LayerNormGRU'
]


# Native library containing the fused forward/backward GPU kernels.
LIB = tf.load_op_library(pkg_resources.resource_filename(__name__, 'libhaste_tf.so'))


@tf.RegisterGradient("HasteLayerNormGru")
def layer_norm_gru_gradient(op, *grads):
  """Gradient function for the HasteLayerNormGru op.

  Feeds the forward op's inputs, outputs, and activation cache into the
  fused backward kernel. Requires the forward op to have been built with
  `training=True` so the activation cache was actually populated.
  """
  training = op.get_attr('training')
  if not training:
    raise ValueError(('LayerNormGRU can only compute gradients if `training=True` was specified '
        'during the forward pass.\nFailed op: {}').format(op.name))

  # Extract inputs and outputs from the op.
  x = op.inputs[0]
  W = op.inputs[1]
  R = op.inputs[2]
  bx = op.inputs[3]
  br = op.inputs[4]
  gamma = op.inputs[5]
  zoneout_mask = op.inputs[6]
  h = op.outputs[0]
  cache = op.outputs[1]

  # Pre-transpose matrices for better performance.
  x = tf.transpose(x, [2, 0, 1])
  W = tf.transpose(W, [1, 0])
  R = tf.transpose(R, [1, 0])

  grads = LIB.haste_layer_norm_gru_grad(x, W, R, bx, br, gamma, h, cache, grads[0], zoneout_mask)
  # The trailing `None` is the (non-existent) gradient for `zoneout_mask`.
  return [*grads, None]


class LayerNormGRULayer(tf.Module):
  """Single-direction LayerNormGRU implementation; wrapped by `LayerNormGRU`."""

  def __init__(self,
        num_units,
        kernel_initializer=None,
        recurrent_initializer=None,
        bias_initializer=None,
        recurrent_bias_initializer=None,
        kernel_transform=None,
        recurrent_transform=None,
        bias_transform=None,
        recurrent_bias_transform=None,
        dropout=0.0,
        zoneout=0.0,
        dtype=None,
        name=None):
    super(LayerNormGRULayer, self).__init__(name)
    self.realname = name
    self.num_units = num_units

    identity = lambda x: x
    # Defaults; each may be overridden by the corresponding constructor arg.
    self.kernel_config = WeightConfig(v1.initializers.glorot_uniform(), None, identity)
    self.recurrent_config = WeightConfig(v1.initializers.orthogonal(), None, identity)
    self.bias_config = WeightConfig(v1.initializers.zeros(), None, identity)
    self.recurrent_bias_config = WeightConfig(v1.initializers.zeros(), None, identity)

    self.kernel_config.override(kernel_initializer, None, kernel_transform)
    self.recurrent_config.override(recurrent_initializer, None, recurrent_transform)
    self.bias_config.override(bias_initializer, None, bias_transform)
    self.recurrent_bias_config.override(recurrent_bias_initializer, None, recurrent_bias_transform)

    self.dropout = dropout
    self.zoneout = zoneout
    self.dtype = dtype or tf.float32
    self.built = False

  def build(self, shape):
    """Create the layer's variables on first use; `shape` is the input shape."""
    if self.built:
      return

    num_units = self.num_units
    input_size = int(shape[-1])

    def build_weights(initializer, shape):
      # Initialize each of the 3 gates independently, then concatenate.
      weights = [initializer(shape, dtype=self.dtype) for _ in range(3)]
      weights = tf.concat(weights, axis=-1)
      return weights

    kernel_weights = build_weights(self.kernel_config.initializer, [input_size, num_units])
    recurrent_weights = build_weights(self.recurrent_config.initializer, [num_units, num_units])
    biases = build_weights(self.bias_config.initializer, [num_units])
    recurrent_biases = build_weights(self.recurrent_bias_config.initializer, [num_units])

    # Store input + recurrent weights (and both biases) as single fused variables.
    weights = tf.concat([kernel_weights, recurrent_weights], axis=0)
    biases = tf.concat([biases, recurrent_biases], axis=0)

    with self.name_scope, v1.variable_scope(self.realname, 'gru_cell'):
      self._kernel = v1.get_variable('kernel', initializer=weights)
      self._bias = v1.get_variable('bias', initializer=biases)
      # Row 0 scales the layer-normalized Wx activations, row 1 the Rh activations.
      self.gamma = v1.get_variable('gamma', shape=[2, self.num_units * 3], initializer=v1.initializers.ones())

    self.built = True

  def get_weights(self):
    """Split the fused variables back apart and apply the configured transforms."""
    input_size = self._kernel.shape.as_list()[0] - self.num_units
    kernel, recurrent_kernel = tf.split(self._kernel, [input_size, self.num_units], axis=0)
    bias, recurrent_bias = tf.split(self._bias, 2, axis=0)
    return {
        'kernel': self.kernel_config.transform(kernel),
        'recurrent_kernel': self.recurrent_config.transform(recurrent_kernel),
        'bias': self.bias_config.transform(bias),
        'recurrent_bias': self.recurrent_bias_config.transform(recurrent_bias),
        'gamma': self.gamma,
    }

  def __call__(self, inputs, sequence_length, training):
    """Run the RNN over `inputs` ([T,N,C]); returns (output [T,N,H], final state)."""
    self.build(inputs.shape)

    shape = tf.shape(inputs)
    time_steps = shape[0]
    batch_size = shape[1]

    # Use an empty zoneout mask if no zoneout is going to be applied.
    # Sadly, we can't pass `None` to the op but at least we won't be wasting
    # memory or bandwidth on this tensor.
    zoneout_mask = tf.zeros([0, 0, 0], dtype=self.dtype)
    if self.zoneout:
      # Bernoulli(1 - zoneout) mask via floor(1 - p + U[0,1)).
      zoneout_mask = 1.0 - self.zoneout
      zoneout_mask += tf.random.uniform([time_steps, batch_size, self.num_units], dtype=self.dtype)
      zoneout_mask = tf.floor(zoneout_mask)

    weights = self.get_weights()
    if training and self.dropout > 0:
      # DropConnect on the recurrent weights only.
      recurrent_kernel = tf.nn.dropout(weights['recurrent_kernel'], rate=self.dropout)
    else:
      recurrent_kernel = weights['recurrent_kernel']

    result, _ = LIB.haste_layer_norm_gru(
        inputs,
        weights['kernel'],
        recurrent_kernel,
        weights['bias'],
        weights['recurrent_bias'],
        weights['gamma'],
        zoneout_mask,
        training=training,
        zoneout_prob=self.zoneout)

    if sequence_length is not None:
      # 0-indexed tensors, so length-1.
      # `result` has T+1 entries with the initial state at index 0, so index
      # `sequence_length` is the state after the last valid step.
      indices = sequence_length
      indices = tf.stack([indices, tf.range(batch_size, dtype=sequence_length.dtype)], axis=-1)
      state = tf.gather_nd(result, indices)
    else:
      state = result[-1]

    # Drop the initial state so callers see exactly T output steps.
    return result[1:], state


class LayerNormGRU(BaseRNN):
  """
  Layer Normalized Gated Recurrent Unit layer.

  This GRU layer applies layer normalization to the input and recurrent output
  activations of a standard GRU. The implementation is fused and
  GPU-accelerated.

  There are two commonly-used variants of GRU cells. This one implements
  1406.1078v1 which applies the reset gate to the hidden state after matrix
  multiplication. The other variant, 1406.1078v3, applies the reset gate
  before matrix multiplication and is currently unsupported.

  This layer has built-in support for DropConnect and Zoneout, which are
  both techniques used to regularize RNNs.
  """

  def __init__(self, num_units, direction='unidirectional', **kwargs):
    """
    Initialize the parameters of the GRU layer.

    Arguments:
      num_units: int, the number of units in the GRU cell.
      direction: string, 'unidirectional' or 'bidirectional'.
      **kwargs: Dict, keyword arguments (see below).

    Keyword Arguments:
      kernel_initializer: (optional) the initializer to use for the input
        matrix weights. Defaults to `glorot_uniform`.
      recurrent_initializer: (optional) the initializer to use for the
        recurrent matrix weights. Defaults to `orthogonal`.
      bias_initializer: (optional) the initializer to use for input bias
        vectors. Defaults to `zeros`.
      recurrent_bias_initializer: (optional) the initializer to use for
        recurrent bias vectors. Defaults to `zeros`.
      kernel_transform: (optional) a function with signature
        `(kernel: Tensor) -> Tensor` that transforms the kernel before it is
        used. Defaults to the identity function.
      recurrent_transform: (optional) a function with signature
        `(recurrent_kernel: Tensor) -> Tensor` that transforms the recurrent
        kernel before it is used. Defaults to the identity function.
      bias_transform: (optional) a function with signature
        `(bias: Tensor) -> Tensor` that transforms the bias before it is used.
        Defaults to the identity function.
      recurrent_bias_transform: (optional) a function with signature
        `(recurrent_bias: Tensor) -> Tensor` that transforms the recurrent
        bias before it is used. Defaults to the identity function.
      dropout: (optional) float, sets the dropout rate for DropConnect
        regularization on the recurrent matrix. Defaults to 0.
      zoneout: (optional) float, sets the zoneout rate for Zoneout
        regularization. Defaults to 0.
      dtype: (optional) the data type for this layer. Defaults to `tf.float32`.
      name: (optional) string, the name for this layer.
    """
    super().__init__(LayerNormGRULayer, num_units, direction, 'gru_cell', **kwargs)

================================================ FILE: frameworks/tf/layer_norm_gru_cell.py ================================================
# Copyright 2020 LMNT, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A GRU cell compatible with the Haste LayerNormGRU layer."""

import tensorflow as tf

from tensorflow.compat import v1
from tensorflow.compat.v1.nn import rnn_cell


__all__ = [
    'LayerNormGRUCell'
]


class LayerNormGRUCell(rnn_cell.RNNCell):
  """
  A GRU cell that's compatible with the Haste LayerNormGRU layer.
This cell can be used on hardware other than GPUs and with other TensorFlow classes that operate on RNN cells (e.g. `dynamic_rnn`, `BasicDecoder`, cell wrappers, etc.). """ def __init__(self, num_units, forget_bias=1.0, dropout=0.0, dtype=None, name=None, **kwargs): super(LayerNormGRUCell, self).__init__(dtype=dtype, name=name, **kwargs) self.realname = name self.num_units = num_units self.forget_bias = forget_bias self.dropout = dropout self.kernel = None self.recurrent_kernel = None self.bias = None self.recurrent_bias = None self.gamma = None self.built = False @property def state_size(self): return self.num_units @property def output_size(self): return self.num_units def build(self, shape): if self.built: return num_units = self.num_units input_size = int(shape[-1]) dtype = self.dtype or tf.float32 kernel_initializer = v1.initializers.glorot_uniform(dtype=dtype) bias_initializer = v1.initializers.zeros(dtype=dtype) with tf.name_scope(self.name), v1.variable_scope(self.realname, 'gru_cell'): self._kernel = v1.get_variable('kernel', initializer=lambda: kernel_initializer([input_size + num_units, num_units * 3])) self._bias = v1.get_variable('bias', initializer=lambda: bias_initializer([num_units * 6])) self.gamma = v1.get_variable('gamma', initializer=v1.initializers.ones()) self.kernel, self.recurrent_kernel = tf.split(self._kernel, [input_size, num_units], axis=0) self.bias, self.recurrent_bias = tf.split(self._bias, 2, axis=0) self.built = True def __call__(self, inputs, state, training=False, scope=None): self.build(inputs.shape) if training and self.dropout > 0: R = tf.nn.dropout(self.recurrent_kernel, rate=self.dropout) else: R = self.recurrent_kernel x = self._layer_norm(inputs @ self.kernel, self.gamma[0]) + self.bias h_proj = self._layer_norm(state @ R, self.gamma[1]) + self.recurrent_bias h_z, h_r, h_g = tf.split(h_proj, 3, axis=-1) x_z, x_r, x_g = tf.split(x, 3, axis=-1) z = tf.nn.sigmoid(h_z + x_z) r = tf.nn.sigmoid(h_r + x_r) g = tf.nn.tanh(r * h_g + 
x_g) h = z * state + (1 - z) * g return h, h def _layer_norm(self, x, gamma): mean, variance = tf.nn.moments(x, axes=[-1], keepdims=True) return tf.nn.batch_normalization(x, mean, variance, None, gamma, 1e-5) ================================================ FILE: frameworks/tf/layer_norm_indrnn.cc ================================================ // Copyright 2020 LMNT, Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // ============================================================================== #include #include "arena.h" #include "haste.h" #include "support.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/shape_inference.h" #include "tensorflow/core/util/stream_executor_util.h" #include "tensorflow/stream_executor/stream.h" using namespace tensorflow; using tensorflow::se::Stream; using tensorflow::shape_inference::DimensionHandle; using tensorflow::shape_inference::InferenceContext; using tensorflow::shape_inference::ShapeHandle; namespace layer_norm = haste::v0::layer_norm; namespace layer_norm_indrnn = haste::v0::layer_norm_indrnn; // Define the interface and shape function for the op. REGISTER_OP("HasteLayerNormIndrnn") .Attr("R: {float, double}") // Some real number type. 
    // NOTE(review): extraction flattened this file and stripped C++ template
    // angle brackets (`template struct ...`, `flat()`); tokens kept as-is.
    .Attr("training: bool")
    .Attr("zoneout_prob: float")
    .Input("x: R")               // [T,N,C]
    .Input("kernel: R")          // [C,H]
    .Input("recurrent_scale: R") // [H]
    .Input("bias: R")            // [H]
    .Input("gamma: R")           // [2,H]
    .Input("zoneout_mask: R")    // [T,N,H]
    .Output("h: R")              // [T+1,N,H]
    .Output("cache: R")
    .SetShapeFn([](InferenceContext* c) {
      ShapeHandle input_shape;
      ShapeHandle kernel_shape;
      ShapeHandle recurrent_shape;
      ShapeHandle bias_shape;
      ShapeHandle gamma_shape;
      ShapeHandle zoneout_mask_shape;

      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 3, &input_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 2, &kernel_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &recurrent_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 1, &bias_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 2, &gamma_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 3, &zoneout_mask_shape));

      const DimensionHandle time_steps = c->Dim(input_shape, 0);
      const DimensionHandle batch_size = c->Dim(input_shape, 1);
      const DimensionHandle hidden_size = c->Dim(recurrent_shape, 0);

      // Output h carries T+1 states: the initial state plus one per step.
      DimensionHandle time_steps_plus_1;
      TF_RETURN_IF_ERROR(c->Add(time_steps, 1, &time_steps_plus_1));

      c->set_output(0, c->MakeShape({ time_steps_plus_1, batch_size, hidden_size }));
      c->set_output(1, c->UnknownShapeOfRank(1));  // flat activation cache
      return Status::OK();
    });

// Forward-pass kernel for the layer-normalized IndRNN.
template struct HasteLayerNormIndrnnOp : public OpKernel {
  explicit HasteLayerNormIndrnnOp(OpKernelConstruction* context) : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("training", &training_));
    OP_REQUIRES_OK(context, context->GetAttr("zoneout_prob", &zoneout_prob_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& kernel = context->input(1);
    const Tensor& recurrent_scale = context->input(2);
    const Tensor& bias = context->input(3);
    const Tensor& gamma = context->input(4);
    const Tensor& zoneout_mask = context->input(5);

    const auto time_steps = input.shape().dim_size(0);
    const auto batch_size = input.shape().dim_size(1);
    const auto input_size = input.shape().dim_size(2);
    const auto hidden_size = recurrent_scale.shape().dim_size(0);
    const bool has_zoneout = zoneout_prob_ && zoneout_mask.NumElements();
    const auto data_type = DataTypeToEnum::value;

    const TensorShape output_shape = { time_steps + 1, batch_size, hidden_size };
    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));

    Tensor workspace;
    const TensorShape workspace_shape = { time_steps, batch_size, hidden_size };
    OP_REQUIRES_OK(context, context->allocate_temp(data_type, workspace_shape, &workspace));

    // Activation cache exposed as output 1 so the gradient op can reuse it.
    const ArenaLayout memory_layout = {
      { "act_Wx", { time_steps, batch_size, hidden_size } },
      { "act_Wx_norm_cache", { time_steps, batch_size, 2 } },
    };

    Tensor* output_cache = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(1, { memory_layout.NumElements() }, &output_cache));

    Arena memory = memory_layout.Realize(output_cache->flat().data());
    TensorView act_Wx = memory["act_Wx"];
    TensorView act_Wx_norm_cache = memory["act_Wx_norm_cache"];

    // Zero-initialize h so the initial state (index 0) starts at zero.
    cudaMemset(output->flat().data(), 0, output->AllocatedBytes());

    layer_norm::ForwardPass layer_norm1(
        time_steps * batch_size,
        hidden_size,
        gamma.SubSlice(0).unaligned_flat().data(),
        nullptr,
        act_Wx_norm_cache.data());

    layer_norm_indrnn::ForwardPass forward(
        training_,
        batch_size,
        input_size,
        hidden_size,
        GetCublasHandle(context));

    forward.Run(
        time_steps,
        kernel.flat().data(),
        recurrent_scale.flat().data(),
        bias.flat().data(),
        input.flat().data(),
        output->flat().data(),
        workspace.flat().data(),
        act_Wx.data(),
        layer_norm1,
        has_zoneout ? zoneout_prob_ : 0.0f,
        has_zoneout ? zoneout_mask.flat().data() : nullptr);
  }

  private:
    bool training_;
    float zoneout_prob_;
};

REGISTER_GPU_KERNEL(HasteLayerNormIndrnn, float);
REGISTER_GPU_KERNEL(HasteLayerNormIndrnn, double);

// Gradient op: inputs are pre-transposed by the Python gradient function.
REGISTER_OP("HasteLayerNormIndrnnGrad")
    .Attr("R: {float, double}")
    .Input("x_t: R")             // [C,N,T]
    .Input("kernel_t: R")        // [4,C]
    .Input("recurrent_scale: R") // [H]
    .Input("bias: R")            // [H]
    .Input("gamma: R")           // [2,H]
    .Input("zoneout_mask: R")    // [T,N,H]
    .Input("h: R")               // [T+1,N,H]
    .Input("cache: R")           // [?]
    .Input("dh_new: R")          // [T+1,N,H]
    .Output("dx: R")             // [T,N,C]
    .Output("dw: R")             // [C,H]
    .Output("dr: R")             // [H]
    .Output("db: R")             // [H]
    .Output("dgamma: R")         // [H]
    .SetShapeFn([](InferenceContext* c) {
      ShapeHandle x_shape;
      ShapeHandle kernel_shape;
      ShapeHandle recurrent_shape;
      ShapeHandle bias_shape;
      ShapeHandle gamma_shape;
      ShapeHandle zoneout_mask_shape;
      ShapeHandle h_shape;
      ShapeHandle cache_shape;
      ShapeHandle dh_new_shape;

      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 3, &x_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 2, &kernel_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &recurrent_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 1, &bias_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 1, &gamma_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 3, &zoneout_mask_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 3, &h_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 1, &cache_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 3, &dh_new_shape));

      // x is transposed, so dims read (C, T, N).
      DimensionHandle input_size = c->Dim(x_shape, 0);
      DimensionHandle time_steps = c->Dim(x_shape, 1);
      DimensionHandle batch_size = c->Dim(x_shape, 2);
      DimensionHandle hidden_size = c->Dim(recurrent_shape, 0);

      c->set_output(0, c->MakeShape({ time_steps, batch_size, input_size }));
      c->set_output(1, c->MakeShape({ input_size, hidden_size }));
      c->set_output(2, recurrent_shape);
      c->set_output(3, bias_shape);
      c->set_output(4, gamma_shape);
      return Status::OK();
    });

template struct
// Backward-pass kernel for the layer-normalized IndRNN.
// NOTE(review): template angle brackets were stripped by extraction
// (`template struct` above); tokens are preserved as-is.
HasteLayerNormIndrnnGradOp : public OpKernel {
  explicit HasteLayerNormIndrnnGradOp(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);            // x, pre-transposed [C,T,N]
    const Tensor& kernel = context->input(1);           // W, pre-transposed
    const Tensor& recurrent_scale = context->input(2);
    const Tensor& bias = context->input(3);
    const Tensor& gamma = context->input(4);
    const Tensor& zoneout_mask = context->input(5);     // empty when zoneout disabled
    const Tensor& h_vector = context->input(6);         // forward states [T+1,N,H]
    const Tensor& cache_input = context->input(7);      // forward activation cache
    const Tensor& dh_new = context->input(8);

    const auto input_size = input.shape().dim_size(0);
    const auto time_steps = input.shape().dim_size(1);
    const auto batch_size = input.shape().dim_size(2);
    const auto hidden_size = recurrent_scale.shape().dim_size(0);
    const bool has_zoneout = !!zoneout_mask.NumElements();
    const auto data_type = DataTypeToEnum::value;

    // Must match the forward op's cache layout.
    const ArenaLayout memory_layout = {
      { "act_Wx", { time_steps, batch_size, hidden_size } },
      { "act_Wx_norm_cache", { time_steps, batch_size, 2 } },
    };

    assert(cache_input.shape().num_elements() == memory_layout.NumElements());

    Arena memory = memory_layout.Realize(const_cast(cache_input.flat().data()));
    TensorView act_Wx = memory["act_Wx"];
    TensorView act_Wx_norm_cache = memory["act_Wx_norm_cache"];

    // Can be uninitialized. Output only, no accumulation.
    const TensorShape dx_shape = { time_steps, batch_size, input_size };
    Tensor* dx = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, dx_shape, &dx));

    // Needs to be initialized to 0.
    const TensorShape dW_shape = { input_size, hidden_size };
    Tensor* dW = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(1, dW_shape, &dW));

    // Needs to be initialized to 0.
    const TensorShape du_shape = { hidden_size };
    Tensor* du = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(2, du_shape, &du));

    // Needs to be initialized to 0.
    Tensor* db = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(3, bias.shape(), &db));

    // Needs to be initialized to 0.
    Tensor* dgamma = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(4, gamma.shape(), &dgamma));

    // Needs to be initialized to 0.
    const TensorShape dh_shape = { batch_size, hidden_size };
    Tensor dh;
    OP_REQUIRES_OK(context, context->allocate_temp(data_type, dh_shape, &dh));

    const TensorShape workspace_shape = { time_steps, batch_size, hidden_size };
    Tensor workspace;
    OP_REQUIRES_OK(context, context->allocate_temp(data_type, workspace_shape, &workspace));

    // Zero the accumulated gradients; they are summed across time steps.
    cudaMemset(dW->flat().data(), 0, dW->AllocatedBytes());
    cudaMemset(du->flat().data(), 0, du->AllocatedBytes());
    cudaMemset(db->flat().data(), 0, db->AllocatedBytes());
    cudaMemset(dgamma->flat().data(), 0, dgamma->AllocatedBytes());
    cudaMemset(dh.flat().data(), 0, dh.AllocatedBytes());

    layer_norm::BackwardPass layer_norm1(
        time_steps * batch_size,
        hidden_size,
        gamma.SubSlice(0).unaligned_flat().data(),
        nullptr,
        act_Wx.data(),
        dgamma->SubSlice(0).unaligned_flat().data(),
        nullptr,
        act_Wx_norm_cache.data());

    layer_norm_indrnn::BackwardPass backward(
        batch_size,
        input_size,
        hidden_size,
        GetCublasHandle(context));

    backward.Run(
        time_steps,
        kernel.flat().data(),
        recurrent_scale.flat().data(),
        bias.flat().data(),
        input.flat().data(),
        h_vector.flat().data(),
        dh_new.flat().data(),
        dx->flat().data(),
        dW->flat().data(),
        du->flat().data(),
        db->flat().data(),
        dh.flat().data(),
        workspace.flat().data(),
        layer_norm1,
        has_zoneout ? zoneout_mask.flat().data() : nullptr);
  }
};

REGISTER_GPU_KERNEL(HasteLayerNormIndrnnGrad, float);
REGISTER_GPU_KERNEL(HasteLayerNormIndrnnGrad, double);

================================================ FILE: frameworks/tf/layer_norm_indrnn.py ================================================
# Copyright 2020 LMNT, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Layer Normalized Independently Recurrent Neural Network""" import pkg_resources import tensorflow as tf from tensorflow.compat import v1 from tensorflow.compat.v1.nn import rnn_cell from .base_rnn import BaseRNN from .weight_config import WeightConfig __all__ = [ 'LayerNormIndRNN' ] LIB = tf.load_op_library(pkg_resources.resource_filename(__name__, 'libhaste_tf.so')) @tf.RegisterGradient("HasteLayerNormIndrnn") def layer_norm_indrnn_gradient(op, *grads): training = op.get_attr('training') if not training: raise ValueError(('LayerNormIndRNN can only compute gradients if `training=True` was specified ' 'during the forward pass.\nFailed op: {}').format(op.name)) # Extract inputs and outputs from the op. x = op.inputs[0] W = op.inputs[1] u = op.inputs[2] b = op.inputs[3] gamma = op.inputs[4] zoneout_mask = op.inputs[5] h = op.outputs[0] cache = op.outputs[1] # Pre-transpose matrices for better performance. 
x = tf.transpose(x, [2, 0, 1]) W = tf.transpose(W, [1, 0]) grads = LIB.haste_layer_norm_indrnn_grad(x, W, u, b, gamma, zoneout_mask, h, cache, grads[0]) return [*grads, None] def _get_initializer(initializer): if not isinstance(initializer, dict): return initializer if 'uniform' in initializer: value = initializer['uniform'] return v1.initializers.random_uniform(-value, value) if 'normal' in initializer: value = initializer['normal'] return v1.initializers.truncated_normal(stddev=value) raise ValueError(f'Unknown initializer {initializer}') class LayerNormIndRNNLayer(tf.Module): def __init__(self, num_units, kernel_initializer=None, recurrent_initializer=None, bias_initializer=None, kernel_transform=None, recurrent_transform=None, bias_transform=None, zoneout=0.0, dtype=None, name=None): super().__init__(name) self.realname = name self.num_units = num_units identity = lambda x: x self.kernel_config = WeightConfig(v1.initializers.glorot_uniform(), None, identity) self.recurrent_config = WeightConfig(v1.initializers.random_uniform(-0.5, 0.5), None, identity) self.bias_config = WeightConfig(v1.initializers.zeros(), None, identity) self.kernel_config.override(_get_initializer(kernel_initializer), None, kernel_transform) self.recurrent_config.override(_get_initializer(recurrent_initializer), None, recurrent_transform) self.bias_config.override(_get_initializer(bias_initializer), None, bias_transform) self.zoneout = zoneout self.dtype = dtype or tf.float32 self.kernel = None self.recurrent_scale = None self.bias = None self.gamma = None self.recurrent_bias = None self.built = False def build(self, shape): if self.built: return num_units = self.num_units input_size = int(shape[-1]) kernel_shape = tf.TensorShape([input_size, num_units]) recurrent_shape = tf.TensorShape([num_units]) bias_shape = tf.TensorShape([num_units]) kernel_weights = self.kernel_config.initializer(kernel_shape, dtype=self.dtype) recurrent_weights = self.recurrent_config.initializer(recurrent_shape, 
dtype=self.dtype) biases = self.bias_config.initializer(bias_shape) with self.name_scope, v1.variable_scope(self.realname, 'indrnn_cell'): self.kernel = v1.get_variable('kernel', initializer=kernel_weights) self.recurrent_scale = v1.get_variable('recurrent_scale', initializer=recurrent_weights) self.bias = v1.get_variable('bias', initializer=biases) self.gamma = v1.get_variable('gamma', shape=[2, self.num_units], initializer=v1.initializers.ones()) self.built = True def get_weights(self): return { 'kernel': self.kernel_config.transform(self.kernel), 'recurrent_scale': self.recurrent_config.transform(self.recurrent_scale), 'bias': self.bias_config.transform(self.bias), 'gamma': self.gamma, } def __call__(self, inputs, sequence_length, training): self.build(inputs.shape) shape = tf.shape(inputs) time_steps = shape[0] batch_size = shape[1] # Use an empty zoneout mask if no zoneout is going to be applied. # Sadly, we can't pass `None` to the op but at least we won't be wasting # memory or bandwidth on this tensor. zoneout_mask = tf.zeros([0, 0, 0], dtype=self.dtype) if self.zoneout: zoneout_mask = 1.0 - self.zoneout zoneout_mask += tf.random.uniform([time_steps, batch_size, self.num_units], dtype=self.dtype) zoneout_mask = tf.floor(zoneout_mask) weights = self.get_weights() result, _ = LIB.haste_layer_norm_indrnn( inputs, weights['kernel'], weights['recurrent_scale'], weights['bias'], weights['gamma'], zoneout_mask, training=training, zoneout_prob=self.zoneout) if sequence_length is not None: # 0-indexed tensors, so length-1. indices = sequence_length indices = tf.stack([indices, tf.range(batch_size, dtype=sequence_length.dtype)], axis=-1) state = tf.gather_nd(result, indices) else: state = result[-1] return result[1:], state class LayerNormIndRNN(BaseRNN): """ Layer Normalized Independently Recurrent Neural Network layer. This IndRNN layer applies layer normalization to the input activations of a standard IndRNN. The implementation is fused and GPU-accelerated. 
This layer has built-in support for Zoneout regularization. """ def __init__(self, num_units, direction='unidirectional', **kwargs): """ Initialize the parameters of the IndRNN layer. Arguments: num_units: int, the number of units in the IndRNN cell. direction: string, 'unidirectional' or 'bidirectional'. **kwargs: Dict, keyword arguments (see below). Keyword Arguments: kernel_initializer: (optional) the initializer to use for the input matrix weights. Defaults to `glorot_uniform`. recurrent_initializer: (optional) the initializer to use for the recurrent scale weights. Defaults to uniform random in [-0.5, 0.5]. Note that this initialization scheme is different than in the original authors' implementation. See https://github.com/lmnt-com/haste/issues/7 for details. bias_initializer: (optional) the initializer to use for the bias vector. Defaults to `zeros`. kernel_transform: (optional) a function with signature `(kernel: Tensor) -> Tensor` that transforms the kernel before it is used. Defaults to the identity function. recurrent_transform: (optional) a function with signature `(recurrent_scale: Tensor) -> Tensor` that transforms the recurrent scale vector before it is used. Defaults to the identity function. bias_transform: (optional) a function with signature `(bias: Tensor) -> Tensor` that transforms the bias before it is used. Defaults to the identity function. zoneout: (optional) float, sets the zoneout rate for Zoneout regularization. Defaults to 0. dtype: (optional) the data type for this layer. Defaults to `tf.float32`. name: (optional) string, the name for this layer. """ super().__init__(LayerNormIndRNNLayer, num_units, direction, 'indrnn_cell', **kwargs) ================================================ FILE: frameworks/tf/layer_norm_lstm.cc ================================================ // Copyright 2020 LMNT, Inc. All Rights Reserved. 
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ==============================================================================

// TensorFlow GPU op bindings for the fused layer-normalized LSTM.
// NOTE(review): angle-bracket text (template arguments such as `<T>` and the
// system header name on the bare `#include` below) appears to have been
// stripped by the text extraction; the tokens are preserved as found.
#include

#include "arena.h"
#include "haste.h"
#include "support.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/util/stream_executor_util.h"
#include "tensorflow/stream_executor/stream.h"

using namespace tensorflow;

using tensorflow::se::Stream;
using tensorflow::shape_inference::DimensionHandle;
using tensorflow::shape_inference::InferenceContext;
using tensorflow::shape_inference::ShapeHandle;

namespace layer_norm = haste::v0::layer_norm;
namespace layer_norm_lstm = haste::v0::layer_norm_lstm;

// Define the interface and shape function for the op.
REGISTER_OP("HasteLayerNormLstm")
    .Attr("R: {float, double}")    // Some real number type.
    .Attr("training: bool")
    .Attr("zoneout_prob: float")
    .Input("x: R")                 // [T,N,C]
    .Input("kernel: R")            // [C,H*4]
    .Input("recurrent_kernel: R")  // [H,H*4]
    .Input("bias: R")              // [H*4]
    .Input("gamma: R")
    .Input("gamma_h: R")
    .Input("beta_h: R")
    .Input("zoneout_mask: R")      // [T,N,H]
    .Output("h: R")                // [T,N,H]
    .Output("c: R")                // [T,N,H]
    .Output("cache: R")            // [?] (activations cache)
    .SetShapeFn([](InferenceContext* c) {
      ShapeHandle input_shape;
      ShapeHandle kernel_shape;
      ShapeHandle recurrent_shape;
      ShapeHandle bias_shape;
      ShapeHandle gamma_shape;
      ShapeHandle gamma_h_shape;
      ShapeHandle beta_h_shape;
      ShapeHandle zoneout_mask_shape;

      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 3, &input_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 2, &kernel_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 2, &recurrent_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 1, &bias_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 2, &gamma_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 1, &gamma_h_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 1, &beta_h_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 3, &zoneout_mask_shape));

      const DimensionHandle time_steps = c->Dim(input_shape, 0);
      const DimensionHandle batch_size = c->Dim(input_shape, 1);
      const DimensionHandle hidden_size = c->Dim(recurrent_shape, 0);

      // Outputs carry T+1 entries: index 0 holds the initial state.
      DimensionHandle time_steps_plus_1;
      TF_RETURN_IF_ERROR(c->Add(time_steps, 1, &time_steps_plus_1));

      c->set_output(0, c->MakeShape({ time_steps_plus_1, batch_size, hidden_size }));
      c->set_output(1, c->MakeShape({ time_steps_plus_1, batch_size, hidden_size }));
      c->set_output(2, c->UnknownShapeOfRank(1));
      return Status::OK();
    });

template
struct HasteLayerNormLstmOp : public OpKernel {
  explicit HasteLayerNormLstmOp(OpKernelConstruction* context) : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("training", &training_));
    OP_REQUIRES_OK(context, context->GetAttr("zoneout_prob", &zoneout_prob_));
  }

  // When running on GPU, TF backs all inputs and outputs with device memory
  // and not host memory. We don't need to do explicit memory copies or allocations
  // for the inputs and outputs.
  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& kernel = context->input(1);
    const Tensor& recurrent_kernel = context->input(2);
    const Tensor& bias = context->input(3);
    const Tensor& gamma = context->input(4);
    const Tensor& gamma_h = context->input(5);
    const Tensor& beta_h = context->input(6);
    const Tensor& zoneout_mask = context->input(7);

    const auto time_steps = input.shape().dim_size(0);
    const auto batch_size = input.shape().dim_size(1);
    const auto input_size = input.shape().dim_size(2);
    const auto hidden_size = recurrent_kernel.shape().dim_size(0);
    // Zoneout is active only if both a probability and a mask were supplied.
    const bool has_zoneout = zoneout_prob_ && zoneout_mask.NumElements();
    const auto data_type = DataTypeToEnum::value;

    OP_REQUIRES(context, input_size == kernel.shape().dim_size(0),
        errors::InvalidArgument("input[2] and kernel[0] dimensions must match. Found ",
          input_size, " and ", kernel.shape().dim_size(0)));

    const TensorShape output_shape = { time_steps + 1, batch_size, hidden_size };
    const TensorShape activations_shape = { time_steps, batch_size, hidden_size * 4 };
    const TensorShape norm_cache_shape = { time_steps, batch_size, 2 };

    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));

    Tensor* output_cell_state = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(1, output_shape, &output_cell_state));

    // All intermediate activations are packed into one flat cache tensor so
    // the backward op can recover them by name via the same layout.
    const ArenaLayout memory_layout = {
      { "act_Wx", activations_shape },
      { "act_Wx_norm", activations_shape },
      { "act_Wx_norm_cache", norm_cache_shape },
      { "act_Rh", activations_shape },
      { "act_Rh_norm_cache", norm_cache_shape },
      { "act_c_norm", { time_steps, batch_size, hidden_size } },
      { "act_c_norm_cache", norm_cache_shape },
    };

    Tensor* output_cache = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(2, { memory_layout.NumElements() }, &output_cache));

    Arena memory = memory_layout.Realize(output_cache->flat().data());
    TensorView act_Wx = memory["act_Wx"];
    TensorView act_Wx_norm = memory["act_Wx_norm"];
    TensorView act_Wx_norm_cache = memory["act_Wx_norm_cache"];
    TensorView act_Rh = memory["act_Rh"];
    TensorView act_Rh_norm_cache = memory["act_Rh_norm_cache"];
    TensorView act_c_norm = memory["act_c_norm"];
    TensorView act_c_norm_cache = memory["act_c_norm_cache"];

    Tensor tmp_Rh;
    const TensorShape tmp_Rh_shape = { batch_size, 4 * hidden_size };
    OP_REQUIRES_OK(context, context->allocate_temp(data_type, tmp_Rh_shape, &tmp_Rh));

    // Zero the outputs so the initial state at index 0 is all zeros.
    cudaMemset(output->flat().data(), 0, output->AllocatedBytes());
    cudaMemset(output_cell_state->flat().data(), 0, output_cell_state->AllocatedBytes());

    // gamma row 0 normalizes Wx, row 1 normalizes Rh; neither uses a beta.
    layer_norm::ForwardPass layer_norm1(
        time_steps * batch_size,
        hidden_size * 4,
        gamma.SubSlice(0).unaligned_flat().data(),
        nullptr,
        act_Wx_norm_cache.data());

    layer_norm::ForwardPass layer_norm2(
        time_steps * batch_size,
        hidden_size * 4,
        gamma.SubSlice(1).unaligned_flat().data(),
        nullptr,
        act_Rh_norm_cache.data());

    // gamma_h/beta_h normalize the cell state before the output tanh.
    layer_norm::ForwardPass layer_norm3(
        time_steps * batch_size,
        hidden_size,
        gamma_h.flat().data(),
        beta_h.flat().data(),
        act_c_norm_cache.data());

    layer_norm_lstm::ForwardPass lstm(
        training_,
        batch_size,
        input_size,
        hidden_size,
        GetCublasHandle(context));

    lstm.Run(
        time_steps,
        kernel.flat().data(),
        recurrent_kernel.flat().data(),
        bias.flat().data(),
        input.flat().data(),
        output->flat().data(),
        output_cell_state->flat().data(),
        act_Wx.data(),
        tmp_Rh.flat().data(),
        layer_norm1,
        act_Wx_norm.data(),
        act_Rh.data(),
        layer_norm2,
        layer_norm3,
        act_c_norm.data(),
        has_zoneout ? zoneout_prob_ : 0.0f,
        has_zoneout ? zoneout_mask.flat().data() : nullptr);
  }

  private:
    bool training_;
    float zoneout_prob_;
};

REGISTER_GPU_KERNEL(HasteLayerNormLstm, float);
REGISTER_GPU_KERNEL(HasteLayerNormLstm, double);

REGISTER_OP("HasteLayerNormLstmGrad")
    .Attr("R: {float, double}")
    .Input("x_t: R")                 // [C,N,T]
    .Input("kernel_t: R")            // [H*4,C]
    .Input("recurrent_kernel_t: R")  // [H*4,H]
    .Input("bias: R")                // [H*4]
    .Input("gamma: R")
    .Input("gamma_h: R")
    .Input("beta_h: R")
    .Input("h: R")                   // [T,N,H]
    .Input("c: R")                   // [T,N,H]
    .Input("cache: R")
    .Input("dh_new: R")              // [T,N,H]
    .Input("dc_new: R")              // [T,N,H]
    .Input("zoneout_mask: R")        // [T,N,H]
    .Output("dx: R")                 // [T,N,C]
    .Output("dw: R")                 // [C,H*4]
    .Output("dr: R")                 // [H,H*4]
    .Output("db: R")                 // [H*4]
    .Output("dgamma: R")
    .Output("dgamma_h: R")
    .Output("dbeta_h: R")
    .SetShapeFn([](InferenceContext* c) {
      ShapeHandle x_shape;
      ShapeHandle kernel_shape;
      ShapeHandle recurrent_kernel_shape;
      ShapeHandle bias_shape;
      ShapeHandle gamma_shape;
      ShapeHandle gamma_h_shape;
      ShapeHandle beta_h_shape;
      ShapeHandle h_shape;
      ShapeHandle c_shape;
      ShapeHandle cache_shape;
      ShapeHandle dh_new_shape;
      ShapeHandle dc_new_shape;
      ShapeHandle zoneout_mask_shape;

      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 3, &x_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 2, &kernel_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 2, &recurrent_kernel_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 1, &bias_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 2, &gamma_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 1, &gamma_h_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 1, &beta_h_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 3, &h_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 3, &c_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(9), 1, &cache_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(10), 3, &dh_new_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(11), 3, &dc_new_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(12), 3, &zoneout_mask_shape));

      // `x_t` arrives pre-transposed to [C,N,T] (see the Python gradient).
      DimensionHandle input_size = c->Dim(x_shape, 0);
      DimensionHandle time_steps = c->Dim(x_shape, 1);
      DimensionHandle batch_size = c->Dim(x_shape, 2);
      DimensionHandle hidden_size = c->Dim(recurrent_kernel_shape, 1);

      DimensionHandle hidden_size_4;
      TF_RETURN_IF_ERROR(c->Multiply(hidden_size, 4, &hidden_size_4));

      c->set_output(0, c->MakeShape({ time_steps, batch_size, input_size }));
      c->set_output(1, c->MakeShape({ input_size, hidden_size_4 }));
      c->set_output(2, c->MakeShape({ hidden_size, hidden_size_4 }));
      c->set_output(3, bias_shape);
      c->set_output(4, gamma_shape);
      c->set_output(5, gamma_h_shape);
      c->set_output(6, beta_h_shape);
      return Status::OK();
    });

template
struct HasteLayerNormLstmGradOp : public OpKernel {
  explicit HasteLayerNormLstmGradOp(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& kernel = context->input(1);
    const Tensor& recurrent_kernel = context->input(2);
    const Tensor& bias = context->input(3);
    const Tensor& gamma = context->input(4);
    const Tensor& gamma_h = context->input(5);
    const Tensor& beta_h = context->input(6);
    const Tensor& h_vector = context->input(7);
    const Tensor& c_vector = context->input(8);
    const Tensor& cache_input = context->input(9);
    const Tensor& dh_new = context->input(10);
    const Tensor& dc_new = context->input(11);
    const Tensor& zoneout_mask = context->input(12);

    // Note [C,N,T] layout: input here is the transposed x_t.
    const auto input_size = input.shape().dim_size(0);
    const auto time_steps = input.shape().dim_size(1);
    const auto batch_size = input.shape().dim_size(2);
    const auto hidden_size = recurrent_kernel.shape().dim_size(1);
    const bool has_zoneout = !!zoneout_mask.NumElements();
    const auto data_type = DataTypeToEnum::value;

    // Can be uninitialized. Output only, no accumulation.
    const TensorShape dx_shape = { time_steps, batch_size, input_size };
    Tensor* dx = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, dx_shape, &dx));

    // Needs to be initialized to 0.
    const TensorShape dW_shape = { input_size, hidden_size * 4 };
    Tensor* dW = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(1, dW_shape, &dW));

    // Needs to be initialized to 0.
    const TensorShape dR_shape = { hidden_size, hidden_size * 4 };
    Tensor* dR = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(2, dR_shape, &dR));

    // Needs to be initialized to 0.
    const TensorShape db_shape = { hidden_size * 4 };
    Tensor* db = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(3, db_shape, &db));

    // Needs to be initialized to 0.
    Tensor* dgamma = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(4, gamma.shape(), &dgamma));

    // Needs to be initialized to 0.
    Tensor* dgamma_h = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(5, gamma_h.shape(), &dgamma_h));

    // Needs to be initialized to 0.
    Tensor* dbeta_h = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(6, beta_h.shape(), &dbeta_h));

    // Needs to be initialized to 0.
    const TensorShape dh_shape = { batch_size, hidden_size };
    Tensor dh;
    OP_REQUIRES_OK(context, context->allocate_temp(data_type, dh_shape, &dh));

    // Needs to be initialized to 0.
    const TensorShape dc_shape = { batch_size, hidden_size };
    Tensor dc;
    OP_REQUIRES_OK(context, context->allocate_temp(data_type, dc_shape, &dc));

    const TensorShape activations_shape = { time_steps, batch_size, hidden_size * 4 };
    const TensorShape norm_cache_shape = { time_steps, batch_size, 2 };

    // Must match the layout used by the forward op exactly.
    const ArenaLayout memory_layout = {
      { "act_Wx", activations_shape },
      { "act_Wx_norm", activations_shape },
      { "act_Wx_norm_cache", norm_cache_shape },
      { "act_Rh", activations_shape },
      { "act_Rh_norm_cache", norm_cache_shape },
      { "act_c_norm", { time_steps, batch_size, hidden_size } },
      { "act_c_norm_cache", norm_cache_shape },
    };

    assert(cache_input.shape().num_elements() == memory_layout.NumElements());

    Arena memory = memory_layout.Realize(const_cast(cache_input.flat().data()));
    TensorView act_Wx = memory["act_Wx"];
    TensorView act_Wx_norm = memory["act_Wx_norm"];
    TensorView act_Wx_norm_cache = memory["act_Wx_norm_cache"];
    TensorView act_Rh = memory["act_Rh"];
    TensorView act_Rh_norm_cache = memory["act_Rh_norm_cache"];
    TensorView act_c_norm = memory["act_c_norm"];
    TensorView act_c_norm_cache = memory["act_c_norm_cache"];

    cudaMemset(dW->flat().data(), 0, dW->AllocatedBytes());
    cudaMemset(dR->flat().data(), 0, dR->AllocatedBytes());
    cudaMemset(db->flat().data(), 0, db->AllocatedBytes());
    cudaMemset(dgamma->flat().data(), 0, dgamma->AllocatedBytes());
    cudaMemset(dgamma_h->flat().data(), 0, dgamma_h->AllocatedBytes());
    cudaMemset(dbeta_h->flat().data(), 0, dbeta_h->AllocatedBytes());
    cudaMemset(dh.flat().data(), 0, dh.AllocatedBytes());
    cudaMemset(dc.flat().data(), 0, dc.AllocatedBytes());

    layer_norm::BackwardPass layer_norm1(
        time_steps * batch_size,
        hidden_size * 4,
        gamma.SubSlice(0).unaligned_flat().data(),
        nullptr,
        act_Wx.data(),
        dgamma->SubSlice(0).unaligned_flat().data(),
        nullptr,
        act_Wx_norm_cache.data());

    layer_norm::BackwardPass layer_norm2(
        time_steps * batch_size,
        hidden_size * 4,
        gamma.SubSlice(1).unaligned_flat().data(),
        nullptr,
        act_Rh.data(),
        dgamma->SubSlice(1).unaligned_flat().data(),
        nullptr,
        act_Rh_norm_cache.data());

    // NOTE(review): `c_vector.SubSlice(1)` appears to be used as a pointer
    // offset past the initial cell state (index 0) into the remaining T
    // timesteps of the contiguous buffer — confirm against Tensor::SubSlice
    // semantics before relying on this elsewhere.
    layer_norm::BackwardPass layer_norm3(
        time_steps * batch_size,
        hidden_size,
        gamma_h.flat().data(),
        beta_h.flat().data(),
        c_vector.SubSlice(1).unaligned_flat().data(),
        dgamma_h->flat().data(),
        dbeta_h->flat().data(),
        act_c_norm_cache.data());

    layer_norm_lstm::BackwardPass lstm(
        batch_size,
        input_size,
        hidden_size,
        GetCublasHandle(context));

    lstm.Run(
        time_steps,
        kernel.flat().data(),
        recurrent_kernel.flat().data(),
        bias.flat().data(),
        input.flat().data(),
        h_vector.flat().data(),
        c_vector.flat().data(),
        dh_new.flat().data(),
        dc_new.flat().data(),
        dx->flat().data(),
        dW->flat().data(),
        dR->flat().data(),
        db->flat().data(),
        dh.flat().data(),
        dc.flat().data(),
        act_Wx.data(),
        layer_norm1,
        act_Wx_norm.data(),
        act_Rh.data(),
        layer_norm2,
        layer_norm3,
        act_c_norm.data(),
        has_zoneout ? zoneout_mask.flat().data() : nullptr);
  }
};

REGISTER_GPU_KERNEL(HasteLayerNormLstmGrad, float);
REGISTER_GPU_KERNEL(HasteLayerNormLstmGrad, double);


================================================
FILE: frameworks/tf/layer_norm_lstm.py
================================================
# Copyright 2020 LMNT, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Layer Normalized Long Short-Term Memory"""


import pkg_resources
import tensorflow as tf

from tensorflow.compat import v1
from tensorflow.compat.v1.nn import rnn_cell

from .base_rnn import BaseRNN
from .weight_config import WeightConfig


__all__ = [
    'LayerNormLSTM'
]


LIB = tf.load_op_library(pkg_resources.resource_filename(__name__, 'libhaste_tf.so'))


@tf.RegisterGradient("HasteLayerNormLstm")
def lstm_gradient(op, *grads):
  """Gradient function for the HasteLayerNormLstm op.

  Only valid when the forward pass ran with `training=True` (otherwise the
  activation cache needed by the backward kernel was not produced).
  """
  training = op.get_attr('training')
  if not training:
    raise ValueError(('LSTM can only compute gradients if `training=True` was specified during the '
                      'forward pass.\nFailed op: {}').format(op.name))

  # Extract inputs and outputs from the op.
  x = op.inputs[0]
  W = op.inputs[1]
  R = op.inputs[2]
  b = op.inputs[3]
  gamma = op.inputs[4]
  gamma_h = op.inputs[5]
  beta_h = op.inputs[6]
  zoneout_mask = op.inputs[7]
  h = op.outputs[0]
  c = op.outputs[1]
  cache = op.outputs[2]

  # Pre-transpose matrices for better performance.
  x = tf.transpose(x, [2, 0, 1])
  W = tf.transpose(W, [1, 0])
  R = tf.transpose(R, [1, 0])

  dx, dW, dR, db, dgamma, dgamma_h, dbeta_h = LIB.haste_layer_norm_lstm_grad(
      x, W, R, b, gamma, gamma_h, beta_h, h, c, cache, grads[0], grads[1], zoneout_mask)
  # The last `None` is the gradient for `zoneout_mask`, which is not trainable.
  return [dx, dW, dR, db, dgamma, dgamma_h, dbeta_h, None]


class LayerNormLSTMLayer(tf.Module):
  # Single-direction building block used by `LayerNormLSTM` (via `BaseRNN`).

  def __init__(self,
        num_units,
        kernel_initializer=None,
        recurrent_initializer=None,
        bias_initializer=None,
        kernel_transform=None,
        recurrent_transform=None,
        bias_transform=None,
        forget_bias=1.0,
        dropout=0.0,
        zoneout=0.0,
        dtype=None,
        name=None):
    super(LayerNormLSTMLayer, self).__init__(name)
    self.realname = name
    self.num_units = num_units

    # Defaults; user-provided initializers/transforms override below.
    identity = lambda x: x
    self.kernel_config = WeightConfig(v1.initializers.glorot_uniform(), None, identity)
    self.recurrent_config = WeightConfig(v1.initializers.orthogonal(), None, identity)
    self.bias_config = WeightConfig(v1.initializers.zeros(), None, identity)

    self.kernel_config.override(kernel_initializer, None, kernel_transform)
    self.recurrent_config.override(recurrent_initializer, None, recurrent_transform)
    self.bias_config.override(bias_initializer, None, bias_transform)

    self.forget_bias = forget_bias
    self.dropout = dropout
    self.zoneout = zoneout
    self.dtype = dtype or tf.float32
    self.kernel = None
    self.bias = None
    self.gamma = None
    self.gamma_h = None
    self.beta_h = None
    self.built = False

  def build(self, shape):
    """Creates the layer variables; idempotent after the first call."""
    if self.built:
      return

    num_units = self.num_units
    input_size = int(shape[-1])

    kernel_shape = tf.TensorShape([input_size, num_units])
    recurrent_shape = tf.TensorShape([num_units, num_units])
    bias_shape = tf.TensorShape([num_units])

    # Each gate gets its own initialized sub-matrix, concatenated below.
    kernel_weights = [self.kernel_config.initializer(kernel_shape, dtype=self.dtype) for _ in range(4)]
    recurrent_weights = [self.recurrent_config.initializer(recurrent_shape, dtype=self.dtype) for _ in range(4)]
    if self.forget_bias:
      # A non-zero forget_bias overrides bias_initializer; index 2 is the
      # forget gate in this gate ordering.
      biases = [tf.zeros(bias_shape, dtype=self.dtype) for _ in range(4)]
      biases[2] = tf.constant(self.forget_bias, shape=bias_shape, dtype=self.dtype)
    else:
      biases = [self.bias_config.initializer(bias_shape, dtype=self.dtype) for _ in range(4)]

    kernel_weights = tf.concat(kernel_weights, axis=-1)
    recurrent_weights = tf.concat(recurrent_weights, axis=-1)
    biases = tf.concat(biases, axis=-1)

    # Use the same format as LSTMBlockCell.
    with self.name_scope, v1.variable_scope(self.realname, 'lstm_cell'):
      # Input and recurrent kernels are stored stacked in one variable.
      weights = tf.concat([kernel_weights, recurrent_weights], axis=0)
      self.kernel = v1.get_variable('kernel', initializer=weights)
      self.bias = v1.get_variable('bias', initializer=biases)
      self.gamma = v1.get_variable('gamma',
          shape=[2, self.num_units * 4],
          initializer=v1.initializers.ones())
      self.gamma_h = v1.get_variable('gamma_h',
          shape=[self.num_units],
          initializer=v1.initializers.ones())
      self.beta_h = v1.get_variable('beta_h',
          shape=[self.num_units],
          initializer=v1.initializers.zeros())
    self.built = True

  def get_weights(self):
    """Splits the stacked variable and returns transformed weight tensors."""
    kernel = self.kernel[:-self.num_units]
    recurrent_kernel = self.kernel[-self.num_units:]
    return {
      'kernel': self.kernel_config.transform(kernel),
      'recurrent_kernel': self.recurrent_config.transform(recurrent_kernel),
      'bias': self.bias_config.transform(self.bias),
      'gamma': self.gamma,
      'gamma_h': self.gamma_h,
      'beta_h': self.beta_h,
    }

  @property
  def state_size(self):
    return rnn_cell.LSTMStateTuple(self.num_units, self.num_units)

  @property
  def output_size(self):
    return self.num_units

  def __call__(self, x, sequence_length, training):
    """Runs the fused layer-norm LSTM op over the full [T,N,C] sequence."""
    self.build(x.shape)

    shape = tf.shape(x)
    time_steps = shape[0]
    batch_size = shape[1]

    # Use an empty zoneout mask if no zoneout is going to be applied.
    # Sadly, we can't pass `None` to the op but at least we won't be wasting
    # memory or bandwidth on this tensor.
    zoneout_mask = tf.zeros([0, 0, 0], dtype=self.dtype)
    if self.zoneout:
      # Bernoulli mask: floor(1 - p + U[0,1)) is 1 with probability 1-p.
      zoneout_mask = 1.0 - self.zoneout
      zoneout_mask += tf.random.uniform([time_steps, batch_size, self.num_units], dtype=self.dtype)
      zoneout_mask = tf.floor(zoneout_mask)

    weights = self.get_weights()
    if training and self.dropout > 0:
      # DropConnect: dropout is applied to the recurrent matrix itself.
      recurrent_kernel = tf.nn.dropout(weights['recurrent_kernel'], rate=self.dropout)
    else:
      recurrent_kernel = weights['recurrent_kernel']

    h, c, _ = LIB.haste_layer_norm_lstm(
        x,
        weights['kernel'],
        recurrent_kernel,
        weights['bias'],
        weights['gamma'],
        weights['gamma_h'],
        weights['beta_h'],
        zoneout_mask,
        training=training,
        zoneout_prob=self.zoneout)

    if sequence_length is not None:
      # `h`/`c` have T+1 entries (initial state at index 0), so indexing with
      # `sequence_length` selects the last valid step of each sequence.
      indices = sequence_length
      indices = tf.stack([indices, tf.range(batch_size, dtype=sequence_length.dtype)], axis=-1)
      state = rnn_cell.LSTMStateTuple(tf.gather_nd(c, indices), tf.gather_nd(h, indices))
    else:
      state = rnn_cell.LSTMStateTuple(c[-1], h[-1])
    return h[1:], state


class LayerNormLSTM(BaseRNN):
  """
  Layer Normalized Long Short-Term Memory layer.

  This LSTM layer applies layer normalization to the input, recurrent, and
  output activations of a standard LSTM. The implementation is fused and
  GPU-accelerated. DropConnect and Zoneout regularization are built-in, and
  this layer allows setting a non-zero initial forget gate bias.

  Details about the exact function this layer implements can be found at
  https://github.com/lmnt-com/haste/issues/1.
  """

  def __init__(self, num_units, direction='unidirectional', **kwargs):
    """
    Initialize the parameters of the LSTM layer.

    Arguments:
      num_units: int, the number of units in the LSTM cell.
      direction: string, 'unidirectional' or 'bidirectional'.
      **kwargs: Dict, keyword arguments (see below).

    Keyword Arguments:
      kernel_initializer: (optional) the initializer to use for the input
        matrix weights. Defaults to `glorot_uniform`.
      recurrent_initializer: (optional) the initializer to use for the
        recurrent matrix weights. Defaults to `orthogonal`.
      bias_initializer: (optional) the initializer to use for both input and
        recurrent bias vectors. Defaults to `zeros` unless `forget_bias` is
        non-zero (see below).
      kernel_transform: (optional) a function with signature
        `(kernel: Tensor) -> Tensor` that transforms the kernel before it is
        used. Defaults to the identity function.
      recurrent_transform: (optional) a function with signature
        `(recurrent_kernel: Tensor) -> Tensor` that transforms the recurrent
        kernel before it is used. Defaults to the identity function.
      bias_transform: (optional) a function with signature
        `(bias: Tensor) -> Tensor` that transforms the bias before it is used.
        Defaults to the identity function.
      forget_bias: (optional) float, sets the initial weights for the forget
        gates. Defaults to 1 and overrides the `bias_initializer` unless this
        argument is set to 0.
      dropout: (optional) float, sets the dropout rate for DropConnect
        regularization on the recurrent matrix. Defaults to 0.
      zoneout: (optional) float, sets the zoneout rate for Zoneout
        regularization. Defaults to 0.
      dtype: (optional) the data type for this layer. Defaults to `tf.float32`.
      name: (optional) string, the name for this layer.
    """
    super().__init__(LayerNormLSTMLayer, num_units, direction, 'lstm_cell', **kwargs)


================================================
FILE: frameworks/tf/layer_norm_lstm_cell.py
================================================
# Copyright 2020 LMNT, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================== """An LSTM cell compatible with the Haste LayerNormLSTM layer.""" import tensorflow as tf from tensorflow.compat import v1 from tensorflow.compat.v1.nn import rnn_cell __all__ = [ 'LayerNormLSTMCell' ] class LayerNormLSTMCell(rnn_cell.RNNCell): """ An LSTM cell that's compatible with the Haste LayerNormLSTM layer. This cell can be used on hardware other than GPUs and with other TensorFlow classes that operate on RNN cells (e.g. `dynamic_rnn`, `BasicDecoder`, cell wrappers, etc.). """ def __init__(self, num_units, forget_bias=1.0, dropout=0.0, dtype=None, name=None, **kwargs): super(LayerNormLSTMCell, self).__init__(dtype=dtype, name=name, **kwargs) self.realname = name self.num_units = num_units self.forget_bias = forget_bias self.dropout = dropout self.kernel = None self.recurrent_kernel = None self.bias = None self.gamma = None self.gamma_h = None self.beta_h = None self.built = False @property def state_size(self): return rnn_cell.LSTMStateTuple(self.num_units, self.num_units) @property def output_size(self): return self.num_units def build(self, shape): num_units = self.num_units input_size = int(shape[-1]) # No user-supplied initializers here since this class should only really # be used for inference on a pre-trained model. 
with tf.name_scope(self.name), v1.variable_scope(self.realname, 'lstm_cell'): self._kernel = v1.get_variable('kernel', shape=[input_size + num_units, num_units * 4]) self.kernel, self.recurrent_kernel = tf.split(self._kernel, [input_size, num_units], axis=0) self.bias = v1.get_variable('bias', shape=[num_units * 4], initializer=v1.initializers.zeros()) self.gamma = v1.get_variable('gamma', shape=[2, num_units * 4], initializer=v1.initializers.ones()) self.gamma_h = v1.get_variable('gamma_h', shape=[num_units], initializer=v1.initializers.ones()) self.beta_h = v1.get_variable('beta_h', shape=[num_units], initializer=v1.initializers.zeros()) self.null = tf.zeros_like(self.gamma[0]) self.built = True def __call__(self, inputs, state, training=False, scope=None): self.build(inputs.shape) if training and self.dropout > 0: R = tf.nn.dropout(self.recurrent_kernel, rate=self.dropout) else: R = self.recurrent_kernel Wx = self._layer_norm(tf.matmul(inputs, self.kernel), self.gamma[0], self.null) Rh = self._layer_norm(tf.matmul(state.h, R), self.gamma[1], self.null) v = Wx + Rh + self.bias v_i, v_g, v_f, v_o = tf.split(v, 4, axis=-1) i = tf.nn.sigmoid(v_i) g = tf.nn.tanh (v_g) f = tf.nn.sigmoid(v_f) o = tf.nn.sigmoid(v_o) c_new = f * state.c + i * g c_tanh = tf.nn.tanh(self._layer_norm(c_new, self.gamma_h, self.beta_h)) h_new = o * c_tanh return h_new, rnn_cell.LSTMStateTuple(c_new, h_new) def _layer_norm(self, x, gamma, beta): mean, variance = tf.nn.moments(x, axes=[-1], keepdims=True) return tf.nn.batch_normalization(x, mean, variance, beta, gamma, 1e-5) ================================================ FILE: frameworks/tf/lstm.cc ================================================ // Copyright 2020 LMNT, Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ==============================================================================

// TensorFlow GPU op bindings for the fused (non-normalized) LSTM.
// NOTE(review): angle-bracket text (template arguments such as `<T>` and the
// system header name on the bare `#include` below) appears to have been
// stripped by the text extraction; the tokens are preserved as found.
#include

#include "haste.h"
#include "support.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/util/stream_executor_util.h"
#include "tensorflow/stream_executor/stream.h"

using namespace tensorflow;

using haste::v0::lstm::ForwardPass;
using haste::v0::lstm::BackwardPass;

using tensorflow::se::Stream;
using tensorflow::shape_inference::DimensionHandle;
using tensorflow::shape_inference::InferenceContext;
using tensorflow::shape_inference::ShapeHandle;

// Define the interface and shape function for the op.
REGISTER_OP("HasteLstm")
    .Attr("R: {float, double}")    // Some real number type.
    .Attr("training: bool")
    .Attr("zoneout_prob: float")
    .Input("x: R")                 // [T,N,C]
    .Input("kernel: R")            // [C,H*4]
    .Input("recurrent_kernel: R")  // [H,H*4]
    .Input("bias: R")              // [H*4]
    .Input("zoneout_mask: R")      // [T,N,H]
    .Output("h: R")                // [T,N,H]
    .Output("c: R")                // [T,N,H]
    .Output("v: R")                // [T,N,H*4]
    .SetShapeFn([](InferenceContext* c) {
      ShapeHandle input_shape;
      ShapeHandle kernel_shape;
      ShapeHandle recurrent_shape;
      ShapeHandle bias_shape;
      ShapeHandle zoneout_mask_shape;

      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 3, &input_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 2, &kernel_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 2, &recurrent_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 1, &bias_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 3, &zoneout_mask_shape));

      const DimensionHandle time_steps = c->Dim(input_shape, 0);
      const DimensionHandle batch_size = c->Dim(input_shape, 1);
      const DimensionHandle hidden_size = c->Dim(recurrent_shape, 0);

      // Outputs carry T+1 entries: index 0 holds the initial state.
      DimensionHandle time_steps_plus_1;
      DimensionHandle hidden_size_4;
      TF_RETURN_IF_ERROR(c->Add(time_steps, 1, &time_steps_plus_1));
      TF_RETURN_IF_ERROR(c->Multiply(hidden_size, 4, &hidden_size_4));

      c->set_output(0, c->MakeShape({ time_steps_plus_1, batch_size, hidden_size }));
      c->set_output(1, c->MakeShape({ time_steps_plus_1, batch_size, hidden_size }));
      c->set_output(2, c->MakeShape({ time_steps, batch_size, hidden_size_4 }));
      return Status::OK();
    });

template
struct HasteLstmOp : public OpKernel {
  explicit HasteLstmOp(OpKernelConstruction* context) : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("training", &training_));
    OP_REQUIRES_OK(context, context->GetAttr("zoneout_prob", &zoneout_prob_));
  }

  // When running on GPU, TF backs all inputs and outputs with device memory
  // and not host memory. We don't need to do explicit memory copies or allocations
  // for the inputs and outputs.
  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& kernel = context->input(1);
    const Tensor& recurrent_kernel = context->input(2);
    const Tensor& bias = context->input(3);
    const Tensor& zoneout_mask = context->input(4);

    const auto time_steps = input.shape().dim_size(0);
    const auto batch_size = input.shape().dim_size(1);
    const auto input_size = input.shape().dim_size(2);
    const auto hidden_size = recurrent_kernel.shape().dim_size(0);
    // Zoneout is active only if both a probability and a mask were supplied.
    const bool has_zoneout = zoneout_prob_ && zoneout_mask.NumElements();
    const auto data_type = DataTypeToEnum::value;

    OP_REQUIRES(context, input_size == kernel.shape().dim_size(0),
        errors::InvalidArgument("input[2] and kernel[0] dimensions must match. Found ",
          input_size, " and ", kernel.shape().dim_size(0)));

    const TensorShape output_shape = { time_steps + 1, batch_size, hidden_size };
    const TensorShape activations_shape = { time_steps, batch_size, hidden_size * 4 };

    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));

    Tensor* output_cell_state = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(1, output_shape, &output_cell_state));

    Tensor output_v_temp;
    Tensor* output_v = nullptr;
    if (training_) {
      // The gate activations `v` are only needed by the backward pass.
      OP_REQUIRES_OK(context, context->allocate_output(2, activations_shape, &output_v));
    } else {
      // Return an empty tensor in inference mode and provide temp memory
      // to the forward pass instead.
      OP_REQUIRES_OK(context, context->allocate_output(2, TensorShape({ 0 }), &output_v));
      OP_REQUIRES_OK(context, context->allocate_temp(data_type, activations_shape, &output_v_temp));
      output_v = &output_v_temp;
    }

    Tensor tmp_Rh;
    const TensorShape tmp_Rh_shape = { batch_size, 4 * hidden_size };
    OP_REQUIRES_OK(context, context->allocate_temp(data_type, tmp_Rh_shape, &tmp_Rh));

    // Zero the outputs so the initial state at index 0 is all zeros.
    cudaMemset(output->flat().data(), 0, output->AllocatedBytes());
    cudaMemset(output_cell_state->flat().data(), 0, output_cell_state->AllocatedBytes());

    ForwardPass forward = ForwardPass(
        training_,
        batch_size,
        input_size,
        hidden_size,
        GetCublasHandle(context));

    forward.Run(
        time_steps,
        kernel.flat().data(),
        recurrent_kernel.flat().data(),
        bias.flat().data(),
        input.flat().data(),
        output->flat().data(),
        output_cell_state->flat().data(),
        output_v->flat().data(),
        tmp_Rh.flat().data(),
        has_zoneout ? zoneout_prob_ : 0.0f,
        has_zoneout ? zoneout_mask.flat().data() : nullptr);
  }

  private:
    bool training_;
    float zoneout_prob_;
};

REGISTER_GPU_KERNEL(HasteLstm, float);
REGISTER_GPU_KERNEL(HasteLstm, double);

// NOTE(review): this gradient op definition is truncated at the end of the
// visible chunk; the remainder of its shape function continues past here.
REGISTER_OP("HasteLstmGrad")
    .Attr("R: {float, double}")
    .Input("x_t: R")                 // [C,N,T]
    .Input("kernel_t: R")            // [H*4,C]
    .Input("recurrent_kernel_t: R")  // [H*4,H]
    .Input("bias: R")                // [H*4]
    .Input("h: R")                   // [T,N,H]
    .Input("c: R")                   // [T,N,H]
    .Input("v: R")                   // [T,N,H*4]
    .Input("dh_new: R")              // [T,N,H]
    .Input("dc_new: R")              // [T,N,H]
    .Input("zoneout_mask: R")        // [T,N,H]
    .Output("dx: R")                 // [T,N,C]
    .Output("dw: R")                 // [C,H*4]
    .Output("dr: R")                 // [H,H*4]
    .Output("db: R")                 // [H*4]
    .SetShapeFn([](InferenceContext* c) {
      ShapeHandle x_shape;
      ShapeHandle kernel_shape;
      ShapeHandle recurrent_kernel_shape;
      ShapeHandle bias_shape;
      ShapeHandle h_shape;
      ShapeHandle c_shape;
      ShapeHandle v_shape;
      ShapeHandle dh_new_shape;
      ShapeHandle dc_new_shape;
      ShapeHandle zoneout_mask_shape;

      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 3, &x_shape));
      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 2, &kernel_shape));
TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 2, &recurrent_kernel_shape)); TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 1, &bias_shape)); TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 3, &h_shape)); TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 3, &c_shape)); TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 3, &v_shape)); TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 3, &dh_new_shape)); TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 3, &dc_new_shape)); TF_RETURN_IF_ERROR(c->WithRank(c->input(9), 3, &zoneout_mask_shape)); DimensionHandle input_size = c->Dim(x_shape, 0); DimensionHandle time_steps = c->Dim(x_shape, 1); DimensionHandle batch_size = c->Dim(x_shape, 2); DimensionHandle hidden_size = c->Dim(recurrent_kernel_shape, 1); DimensionHandle hidden_size_4; TF_RETURN_IF_ERROR(c->Multiply(hidden_size, 4, &hidden_size_4)); c->set_output(0, c->MakeShape({ time_steps, batch_size, input_size })); c->set_output(1, c->MakeShape({ input_size, hidden_size_4 })); c->set_output(2, c->MakeShape({ hidden_size, hidden_size_4 })); c->set_output(3, bias_shape); return Status::OK(); }); template struct HasteLstmGradOp : public OpKernel { explicit HasteLstmGradOp(OpKernelConstruction* context) : OpKernel(context) {} void Compute(OpKernelContext* context) override { const Tensor& input = context->input(0); const Tensor& kernel = context->input(1); const Tensor& recurrent_kernel = context->input(2); const Tensor& bias = context->input(3); const Tensor& h_vector = context->input(4); const Tensor& c_vector = context->input(5); const Tensor& dv = context->input(6); const Tensor& dh_new = context->input(7); const Tensor& dc_new = context->input(8); const Tensor& zoneout_mask = context->input(9); const auto input_size = input.shape().dim_size(0); const auto time_steps = input.shape().dim_size(1); const auto batch_size = input.shape().dim_size(2); const auto hidden_size = recurrent_kernel.shape().dim_size(1); const bool has_zoneout = !!zoneout_mask.NumElements(); const auto data_type = 
DataTypeToEnum::value; // Can be uninitialized. Output only, no accumulation. const TensorShape dx_shape = { time_steps, batch_size, input_size }; Tensor* dx = nullptr; OP_REQUIRES_OK(context, context->allocate_output(0, dx_shape, &dx)); // Needs to be initialized to 0. const TensorShape dW_shape = { input_size, hidden_size * 4 }; Tensor* dW = nullptr; OP_REQUIRES_OK(context, context->allocate_output(1, dW_shape, &dW)); // Needs to be initialized to 0. const TensorShape dR_shape = { hidden_size, hidden_size * 4 }; Tensor* dR = nullptr; OP_REQUIRES_OK(context, context->allocate_output(2, dR_shape, &dR)); // Needs to be initialized to 0. const TensorShape db_shape = { hidden_size * 4 }; Tensor* db = nullptr; OP_REQUIRES_OK(context, context->allocate_output(3, db_shape, &db)); // Needs to be initialized to 0. const TensorShape dh_shape = { batch_size, hidden_size }; Tensor dh; OP_REQUIRES_OK(context, context->allocate_temp(data_type, dh_shape, &dh)); // Needs to be initialized to 0. const TensorShape dc_shape = { batch_size, hidden_size }; Tensor dc; OP_REQUIRES_OK(context, context->allocate_temp(data_type, dc_shape, &dc)); cudaMemset(dW->flat().data(), 0, dW->AllocatedBytes()); cudaMemset(dR->flat().data(), 0, dR->AllocatedBytes()); cudaMemset(db->flat().data(), 0, db->AllocatedBytes()); cudaMemset(dh.flat().data(), 0, dh.AllocatedBytes()); cudaMemset(dc.flat().data(), 0, dc.AllocatedBytes()); BackwardPass backward = BackwardPass( batch_size, input_size, hidden_size, GetCublasHandle(context)); backward.Run( time_steps, kernel.flat().data(), recurrent_kernel.flat().data(), bias.flat().data(), input.flat().data(), h_vector.flat().data(), c_vector.flat().data(), dh_new.flat().data(), dc_new.flat().data(), dx->flat().data(), dW->flat().data(), dR->flat().data(), db->flat().data(), dh.flat().data(), dc.flat().data(), const_cast(dv.flat().data()), has_zoneout ? 
zoneout_mask.flat().data() : nullptr); } }; REGISTER_GPU_KERNEL(HasteLstmGrad, float); REGISTER_GPU_KERNEL(HasteLstmGrad, double); ================================================ FILE: frameworks/tf/lstm.py ================================================ # Copyright 2020 LMNT, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Long Short-Term Memory""" import pkg_resources import tensorflow as tf from tensorflow.compat import v1 from tensorflow.compat.v1.nn import rnn_cell from .base_rnn import BaseRNN from .weight_config import WeightConfig __all__ = [ 'LSTM' ] LIB = tf.load_op_library(pkg_resources.resource_filename(__name__, 'libhaste_tf.so')) @tf.RegisterGradient("HasteLstm") def lstm_gradient(op, *grads): training = op.get_attr('training') if not training: raise ValueError(('LSTM can only compute gradients if `training=True` was specified during the ' 'forward pass.\nFailed op: {}').format(op.name)) # Extract inputs and outputs from the op. x = op.inputs[0] W = op.inputs[1] R = op.inputs[2] b = op.inputs[3] zoneout_mask = op.inputs[4] h = op.outputs[0] c = op.outputs[1] v = op.outputs[2] # Pre-transpose matrices for better performance. 
x = tf.transpose(x, [2, 0, 1]) W = tf.transpose(W, [1, 0]) R = tf.transpose(R, [1, 0]) dx, dW, dR, db = LIB.haste_lstm_grad(x, W, R, b, h, c, v, grads[0], grads[1], zoneout_mask) return [dx, dW, dR, db, None] class LSTMLayer(tf.Module): def __init__(self, num_units, kernel_initializer=None, recurrent_initializer=None, bias_initializer=None, kernel_transform=None, recurrent_transform=None, bias_transform=None, forget_bias=1.0, dropout=0.0, zoneout=0.0, dtype=None, name=None, cudnn_compat=False): super(LSTMLayer, self).__init__(name) self.realname = name self.input_size = None self.num_units = num_units identity = lambda x: x self.kernel_config = WeightConfig(v1.initializers.glorot_uniform(), None, identity) self.recurrent_config = WeightConfig(v1.initializers.orthogonal(), None, identity) self.bias_config = WeightConfig(v1.initializers.zeros(), None, identity) self.kernel_config.override(kernel_initializer, None, kernel_transform) self.recurrent_config.override(recurrent_initializer, None, recurrent_transform) self.bias_config.override(bias_initializer, None, bias_transform) self.forget_bias = forget_bias self.dropout = dropout self.zoneout = zoneout self.dtype = dtype or tf.float32 self.cudnn_compat = cudnn_compat self.opaque = None self.kernel = None self.bias = None self.built = False def build(self, shape): if self.built: return num_units = self.num_units input_size = int(shape[-1]) kernel_shape = tf.TensorShape([input_size, num_units]) recurrent_shape = tf.TensorShape([num_units, num_units]) bias_shape = tf.TensorShape([num_units]) kernel_weights = [self.kernel_config.initializer(kernel_shape, dtype=self.dtype) for _ in range(4)] recurrent_weights = [self.recurrent_config.initializer(recurrent_shape, dtype=self.dtype) for _ in range(4)] if self.forget_bias: biases = [tf.zeros(bias_shape, dtype=self.dtype) for _ in range(4)] biases[2] = tf.constant(self.forget_bias, shape=bias_shape, dtype=self.dtype) else: biases = [self.bias_config.initializer(bias_shape, 
dtype=self.dtype) for _ in range(4)] kernel_weights = tf.concat(kernel_weights, axis=-1) recurrent_weights = tf.concat(recurrent_weights, axis=-1) biases = tf.concat(biases, axis=-1) if not self.cudnn_compat: # Use the same format as LSTMBlockCell. with self.name_scope, v1.variable_scope(self.realname, 'lstm_cell'): weights = tf.concat([kernel_weights, recurrent_weights], axis=0) self.kernel = v1.get_variable('kernel', initializer=weights) self.bias = v1.get_variable('bias', initializer=biases) else: # Use the same format as CudnnLSTM. with self.name_scope, v1.variable_scope(self.realname, 'lstm_cell'): with v1.variable_scope('cudnn_lstm'): # Sigh, cuDNN uses two bias vectors instead of just one. extra_biases = [self.bias_initializer(tf.TensorShape([num_units]), dtype=self.dtype) for _ in range(4)] extra_biases = tf.concat(extra_biases, axis=-1) kernel_weights = tf.reshape(kernel_weights, [-1]) recurrent_weights = tf.reshape(recurrent_weights, [-1]) opaque_initial_value = tf.concat([kernel_weights, recurrent_weights, biases, extra_biases], axis=-1) self.opaque = v1.get_variable('opaque_kernel', initializer=opaque_initial_value) self.input_size = input_size self.built = True def get_weights(self): if self.cudnn_compat: # Split into 3 variables. W_size = 4 * self.input_size * self.num_units R_size = 4 * self.num_units * self.num_units b_size = 8 * self.num_units kernel, recurrent_kernel, bias = tf.split(opaque, [W_size, R_size, b_size]) # Convert from cuDNN [i, f, g, o] format to TF and LMNT [i, g, f, o] format. # Note that we only use a single bias vector so we sum the two separate ones # and then reorder formats. Wi, Wf, Wg, Wo = tf.split(kernel, 4) Ri, Rf, Rg, Ro = tf.split(recurrent_kernel, 4) bi, bf, bg, bo = tf.split(tf.reduce_sum(tf.split(bias, 2), axis=0), 4) kernel = tf.concat([Wi, Wg, Wf, Wo], axis=0) recurrent_kernel = tf.concat([Ri, Rg, Rf, Ro], axis=0) bias = tf.concat([bi, bg, bf, bo], axis=0) # Shape them correctly. 
kernel = tf.reshape(kernel, [4 * self.num_units, self.input_size]) recurrent_kernel = tf.reshape(recurrent_kernel, [4 * self.num_units, self.num_units]) bias = tf.reshape(bias, [4 * self.num_units]) # Pre-transpose the kernels. kernel = tf.transpose(kernel, [1, 0]) recurrent_kernel = tf.transpose(recurrent_kernel, [1, 0]) else: kernel = self.kernel[:-self.num_units] recurrent_kernel = self.kernel[-self.num_units:] bias = self.bias return { 'kernel': self.kernel_config.transform(kernel), 'recurrent_kernel': self.recurrent_config.transform(recurrent_kernel), 'bias': self.bias_config.transform(bias) } @property def state_size(self): return rnn_cell.LSTMStateTuple(self.num_units, self.num_units) @property def output_size(self): return self.num_units def __call__(self, x, sequence_length, training): self.build(x.shape) shape = tf.shape(x) time_steps = shape[0] batch_size = shape[1] # Use an empty zoneout mask if no zoneout is going to be applied. # Sadly, we can't pass `None` to the op but at least we won't be wasting # memory or bandwidth on this tensor. 
zoneout_mask = tf.zeros([0, 0, 0], dtype=self.dtype) if self.zoneout: zoneout_mask = 1.0 - self.zoneout zoneout_mask += tf.random.uniform([time_steps, batch_size, self.num_units], dtype=self.dtype) zoneout_mask = tf.floor(zoneout_mask) weights = self.get_weights() if training and self.dropout > 0: recurrent_kernel = tf.nn.dropout(weights['recurrent_kernel'], rate=self.dropout) else: recurrent_kernel = weights['recurrent_kernel'] h, c, _ = LIB.haste_lstm( x, weights['kernel'], recurrent_kernel, weights['bias'], zoneout_mask, training=training, zoneout_prob=self.zoneout) if sequence_length is not None: indices = sequence_length indices = tf.stack([indices, tf.range(batch_size, dtype=sequence_length.dtype)], axis=-1) state = rnn_cell.LSTMStateTuple(tf.gather_nd(c, indices), tf.gather_nd(h, indices)) else: state = rnn_cell.LSTMStateTuple(c[-1], h[-1]) return h[1:], state class LSTM(BaseRNN): """ Long Short-Term Memory layer. This LSTM layer offers a fused, GPU-accelerated TensorFlow op for inference and training. Its weights and variables are compatible with `BasicLSTMCell`, `LSTMCell`, and `LSTMBlockCell` by default, and is able to load weights from `tf.contrib.cudnn_rnn.CudnnLSTM` when `cudnn_compat=True` is specified. Although this implementation is comparable in performance to cuDNN's LSTM, it offers additional options not typically found in other high-performance implementations. DropConnect and Zoneout regularization are built-in, and this layer allows setting a non-zero initial forget gate bias. """ def __init__(self, num_units, direction='unidirectional', **kwargs): """ Initialize the parameters of the LSTM layer. Arguments: num_units: int, the number of units in the LSTM cell. direction: string, 'unidirectional' or 'bidirectional'. **kwargs: Dict, keyword arguments (see below). Keyword Arguments: kernel_initializer: (optional) the initializer to use for the input matrix weights. Defaults to `glorot_uniform`. 
recurrent_initializer: (optional) the initializer to use for the recurrent matrix weights. Defaults to `orthogonal`. bias_initializer: (optional) the initializer to use for both input and recurrent bias vectors. Defaults to `zeros` unless `forget_bias` is non-zero (see below). kernel_transform: (optional) a function with signature `(kernel: Tensor) -> Tensor` that transforms the kernel before it is used. Defaults to the identity function. recurrent_transform: (optional) a function with signature `(recurrent_kernel: Tensor) -> Tensor` that transforms the recurrent kernel before it is used. Defaults to the identity function. bias_transform: (optional) a function with signature `(bias: Tensor) -> Tensor` that transforms the bias before it is used. Defaults to the identity function. forget_bias: (optional) float, sets the initial weights for the forget gates. Defaults to 1 and overrides the `bias_initializer` unless this argument is set to 0. dropout: (optional) float, sets the dropout rate for DropConnect regularization on the recurrent matrix. Defaults to 0. zoneout: (optional) float, sets the zoneout rate for Zoneout regularization. Defaults to 0. dtype: (optional) the data type for this layer. Defaults to `tf.float32`. name: (optional) string, the name for this layer. cudnn_compat: (optional) bool, if `True`, the variables created by this layer are compatible with `tf.contrib.cudnn_rnn.CudnnLSTM`. Note that this should only be set if you're restoring variables from a cuDNN model. It's currently not possible to train a model with `cudnn_compat=True` and restore it with CudnnLSTM. Defaults to `False`. """ super().__init__(LSTMLayer, num_units, direction, 'lstm_cell', **kwargs) ================================================ FILE: frameworks/tf/support.cc ================================================ // Copyright 2020 LMNT, Inc. All Rights Reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // ============================================================================== #include #include #include #include #include #include "support.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/util/stream_executor_util.h" #include "tensorflow/stream_executor/stream.h" using Key = std::pair; namespace std { template<> struct hash { size_t operator()(const Key& key) const noexcept { const auto h1 = std::hash{}(key.first); const auto h2 = std::hash{}(key.second); return h1 ^ (h2 << 1); } }; } // LOL. 
cublasHandle_t GetCublasHandle(tensorflow::OpKernelContext* context) { static std::unordered_map handle_map; static std::mutex mutex; int device; std::lock_guard lock(mutex); std::thread::id tid = std::this_thread::get_id(); cudaGetDevice(&device); cudaStream_t stream = GetCudaStream(context); auto item = std::make_pair(device, tid); auto i = handle_map.find(item); if (i == std::end(handle_map)) { cublasHandle_t handle; cublasCreate(&handle); cublasSetStream(handle, stream); i = handle_map.insert(std::make_pair(item, handle)).first; } return i->second; } const cudaStream_t& GetCudaStream(tensorflow::OpKernelContext* context) { const auto ptr = context->op_device_context()->stream()->implementation()->GpuStreamMemberHack(); return *reinterpret_cast(ptr); } ================================================ FILE: frameworks/tf/support.h ================================================ // Copyright 2020 LMNT, Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
// ============================================================================== #pragma once #include #include namespace tensorflow { class OpKernelContext; } #define REGISTER_GPU_KERNEL(NAME, T) \ REGISTER_KERNEL_BUILDER(Name(#NAME) \ .Device(DEVICE_GPU) \ .TypeConstraint("R"), \ NAME##Op) cublasHandle_t GetCublasHandle(tensorflow::OpKernelContext* context); const cudaStream_t& GetCudaStream(tensorflow::OpKernelContext* context); ================================================ FILE: frameworks/tf/weight_config.py ================================================ # Copyright 2020 LMNT, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== class WeightConfig: __slots__ = ['initializer', 'constraint', 'transform'] def __init__(self, initializer=None, constraint=None, transform=None): self.initializer = initializer self.constraint = constraint self.transform = transform def override(self, initializer, constraint, transform): if initializer is not None: self.initializer = initializer if constraint is not None: self.constraint = constraint if transform is not None: self.transform = transform return self ================================================ FILE: frameworks/tf/zoneout_wrapper.py ================================================ # Copyright 2020 LMNT, Inc. All Rights Reserved. 
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""An RNN cell wrapper that applies Zoneout."""

import tensorflow as tf
from tensorflow.compat.v1.nn import rnn_cell


__all__ = [
    'ZoneoutWrapper'
]


class ZoneoutWrapper(rnn_cell.RNNCell):
  """
  An LSTM/GRU cell wrapper that applies zoneout to the inner cell's hidden
  state. The zoneout paper applies zoneout to both the cell state and hidden
  state, each with its own zoneout rate. This class (and the `LSTM`
  implementation in Haste) applies zoneout to the hidden state and not the
  cell state.
  """

  def __init__(self, cell, rate, training):
    """
    Initialize the parameters of the zoneout wrapper.

    Arguments:
      cell: RNNCell, an instance of {`BasicLSTMCell`, `LSTMCell`,
        `LSTMBlockCell`, `haste_tf.GRUCell`} on which to apply zoneout.
      rate: float, 0 <= rate <= 1, the percent of hidden units to zone out
        per time step.
      training: bool, `True` if used during training, `False` if used during
        inference.
    """
    super(ZoneoutWrapper, self).__init__()
    self.cell = cell
    self.rate = rate
    self.training = training

  @property
  def state_size(self):
    return self.cell.state_size

  @property
  def output_size(self):
    return self.cell.output_size

  def __call__(self, inputs, state, scope=None):
    """
    Runs one step of the RNN cell with zoneout applied.

    Arguments:
      see documentation for the inner cell.
    """
    output, new_state = self.cell(inputs, state, scope)

    # Zoneout disabled
    if not self.rate:
      return output, new_state

    # Apply zoneout only to the hidden state; the cell state (if any) passes
    # through unchanged.
    if isinstance(new_state, rnn_cell.LSTMStateTuple):
      zoned_out_h = self._apply_zoneout(new_state.h, state.h)
      return output, rnn_cell.LSTMStateTuple(new_state.c, zoned_out_h)
    elif isinstance(new_state, list) and len(new_state) == 1:
      return output, self._apply_zoneout(new_state[0], state[0])
    elif isinstance(new_state, tf.Tensor):
      return output, self._apply_zoneout(new_state, state)
    else:
      raise ValueError(('ZoneoutWrapper wraps cells that return LSTMStateTuple or '
          'unnested state Tensors. Please use one of the following cell types:\n'
          '  tf.nn.rnn_cell.BasicLSTMCell\n'
          '  tf.nn.rnn_cell.LSTMCell\n'
          '  tf.contrib.rnn.LSTMBlockCell\n'
          '  haste_tf.GRUCell'))

  def _apply_zoneout(self, new_tensor, old_tensor):
    if self.training:
      # Training: random mask — units with mask==0 keep the old value,
      # units with mask==1 take the new value.
      mask = self._build_mask(tf.shape(new_tensor))
      zoned_out = (new_tensor - old_tensor) * mask + old_tensor
    else:
      # Inference: deterministic blend using the expected mask value.
      zoned_out = self.rate * old_tensor + (1.0 - self.rate) * new_tensor
    return zoned_out

  def _build_mask(self, shape):
    # Bernoulli(1 - rate) sample built as floor(uniform + keep_prob).
    mask = 1 - self.rate
    mask += tf.random.uniform(shape)
    return tf.floor(mask)


================================================
FILE: lib/blas.h
================================================
// Copyright 2020 LMNT, Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ============================================================================== #pragma once #include template struct blas { struct set_pointer_mode { set_pointer_mode(cublasHandle_t handle) : handle_(handle) { cublasGetPointerMode(handle_, &old_mode_); cublasSetPointerMode(handle_, CUBLAS_POINTER_MODE_HOST); } ~set_pointer_mode() { cublasSetPointerMode(handle_, old_mode_); } private: cublasHandle_t handle_; cublasPointerMode_t old_mode_; }; struct enable_tensor_cores { enable_tensor_cores(cublasHandle_t handle) : handle_(handle) { cublasGetMathMode(handle_, &old_mode_); cublasSetMathMode(handle_, CUBLAS_TENSOR_OP_MATH); } ~enable_tensor_cores() { cublasSetMathMode(handle_, old_mode_); } private: cublasHandle_t handle_; cublasMath_t old_mode_; }; }; template<> struct blas<__half> { static constexpr decltype(cublasHgemm)* gemm = &cublasHgemm; }; template<> struct blas { static constexpr decltype(cublasSgemm)* gemm = &cublasSgemm; }; template<> struct blas { static constexpr decltype(cublasDgemm)* gemm = &cublasDgemm; }; ================================================ FILE: lib/device_assert.h ================================================ // Copyright 2020 LMNT, Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
// ============================================================================== #pragma once extern "C" __host__ __device__ void __assertfail( const char * __assertion, const char *__file, unsigned int __line, const char *__function, size_t charsize); #define device_assert_fail(msg) \ __assertfail((msg), __FILE__, __LINE__, __PRETTY_FUNCTION__, sizeof(char)) ================================================ FILE: lib/gru_backward_gpu.cu.cc ================================================ // Copyright 2020 LMNT, Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
// ============================================================================== #include #include #include "blas.h" #include "device_assert.h" #include "haste.h" #include "inline_ops.h" namespace { template __global__ void PointwiseOperations(const int batch_dim, const int hidden_dim, const T* h, const T* v, const T* dh_new, T* dbx_out, T* dbr_out, T* dh_inout, T* dp_out, T* dq_out, const T* zoneout_mask) { // Zoneout mask (only used if ApplyZoneout==true) const int row = blockDim.x * blockIdx.x + threadIdx.x; const int col = blockDim.y * blockIdx.y + threadIdx.y; if (row >= hidden_dim || col >= batch_dim) return; const int base_idx = col * hidden_dim + row; T dh_total = dh_new[base_idx] + dh_inout[base_idx]; const int stride4_base_idx = col * (hidden_dim * 4) + row; const int z_idx = stride4_base_idx + 0 * hidden_dim; const int r_idx = stride4_base_idx + 1 * hidden_dim; const int g_idx = stride4_base_idx + 2 * hidden_dim; const int q_g_idx = stride4_base_idx + 3 * hidden_dim; const T z = v[z_idx]; const T r = v[r_idx]; const T g = v[g_idx]; const T q_g = v[q_g_idx]; if (ApplyZoneout) { const T mask = zoneout_mask[base_idx]; dh_inout[base_idx] = (static_cast(1.0) - mask) * dh_total; dh_total = mask * dh_total; dh_inout[base_idx] += z * dh_total; } else { dh_inout[base_idx] = z * dh_total; } const T dg = (static_cast(1.0) - z) * dh_total; const T dz = (h[base_idx] - g) * dh_total; const T dp_g = d_tanh(g) * dg; const T dq_g = dp_g * r; const T dr = dp_g * q_g; const T dp_r = d_sigmoid(r) * dr; const T dq_r = dp_r; const T dp_z = d_sigmoid(z) * dz; const T dq_z = dp_z; const int idx = col * (hidden_dim * 3) + row; dp_out[idx + 0 * hidden_dim] = dp_z; dp_out[idx + 1 * hidden_dim] = dp_r; dp_out[idx + 2 * hidden_dim] = dp_g; dq_out[idx + 0 * hidden_dim] = dq_z; dq_out[idx + 1 * hidden_dim] = dq_r; dq_out[idx + 2 * hidden_dim] = dq_g; atomicAdd(&dbx_out[row + 0 * hidden_dim], dp_z); atomicAdd(&dbx_out[row + 1 * hidden_dim], dp_r); atomicAdd(&dbx_out[row + 2 * 
hidden_dim], dp_g); atomicAdd(&dbr_out[row + 0 * hidden_dim], dq_z); atomicAdd(&dbr_out[row + 1 * hidden_dim], dq_r); atomicAdd(&dbr_out[row + 2 * hidden_dim], dq_g); } #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700) template __global__ void PointwiseOperations(const int batch_dim, const int hidden_dim, const half* h, const half* v, const half* dh_new, half* dbx_out, half* dbr_out, half* dh_inout, half* dp_out, half* dq_out, const half* zoneout_mask) { device_assert_fail("FP16 is not supported on compute capability < 7.0."); } #endif } // anonymous namespace namespace haste { namespace v0 { namespace gru { template struct BackwardPass::private_data { int batch_size; int input_size; int hidden_size; cublasHandle_t blas_handle; cudaStream_t stream[2]; cudaEvent_t event; cudaStream_t sync_stream; }; template BackwardPass::BackwardPass( const int batch_size, const int input_size, const int hidden_size, const cublasHandle_t& blas_handle, const cudaStream_t& stream) : data_(new private_data) { data_->batch_size = batch_size; data_->input_size = input_size; data_->hidden_size = hidden_size; data_->blas_handle = blas_handle; data_->sync_stream = stream; cudaStreamCreate(&data_->stream[0]); cudaStreamCreate(&data_->stream[1]); cudaEventCreateWithFlags(&data_->event, cudaEventDisableTiming); } template BackwardPass::~BackwardPass() { if (data_->sync_stream) { cudaEventRecord(data_->event, data_->stream[1]); cudaStreamWaitEvent(data_->sync_stream, data_->event, 0); cudaEventRecord(data_->event, data_->stream[0]); cudaStreamWaitEvent(data_->sync_stream, data_->event, 0); } else { cudaStreamSynchronize(data_->stream[1]); cudaStreamSynchronize(data_->stream[0]); } cudaEventDestroy(data_->event); cudaStreamDestroy(data_->stream[1]); cudaStreamDestroy(data_->stream[0]); delete data_; } template void BackwardPass::Iterate( const T* W_t, // [H*3,C] const T* R_t, // [H*3,H] const T* bx, // [H*3] const T* br, // [H*3] const T* x_t, // [C,N] const T* h, // [N,H] const T* v, // 
[N,H*4] const T* dh_new, // [N,H] T* dx, // [N,C] T* dW, // [C,H*3] T* dR, // [H,H*3] T* dbx, // [H*3] T* dbr, // [H*3] T* dh, // [N,H] T* dp, // [N,H*3] T* dq, // [N,H*3] const T* zoneout_mask) { // [N,H] const blas::set_pointer_mode scoped1(data_->blas_handle); const T alpha = static_cast(1.0); const T beta_sum = static_cast(1.0); const T beta_assign = static_cast(0.0); const int batch_size = data_->batch_size; const int hidden_size = data_->hidden_size; const int input_size = data_->input_size; const cublasHandle_t blas_handle = data_->blas_handle; const cudaStream_t stream1 = data_->stream[0]; const cudaStream_t stream2 = data_->stream[1]; const cudaEvent_t event = data_->event; cudaStream_t save_stream; cublasGetStream(blas_handle, &save_stream); IterateInternal( R_t, h, v, dh_new, dbx, dbr, dh, dp, dq, zoneout_mask); cublasSetStream(blas_handle, stream1); blas::gemm(blas_handle, CUBLAS_OP_N, CUBLAS_OP_N, hidden_size * 3, input_size, batch_size, &alpha, dp, hidden_size * 3, x_t, batch_size, &beta_sum, dW, hidden_size * 3); // Wait for pointwise operations to complete since there's a // data dependency between its output (`dp`, `dq`) and the following matmuls. 
cudaStreamWaitEvent(stream2, event, 0); cublasSetStream(blas_handle, stream2); blas::gemm(blas_handle, CUBLAS_OP_N, CUBLAS_OP_N, input_size, batch_size, hidden_size * 3, &alpha, W_t, input_size, dp, hidden_size * 3, &beta_assign, dx, input_size); cublasSetStream(blas_handle, stream2); blas::gemm(blas_handle, CUBLAS_OP_N, CUBLAS_OP_T, hidden_size * 3, hidden_size, batch_size, &alpha, dq, hidden_size * 3, h, hidden_size, &beta_sum, dR, hidden_size * 3); cublasSetStream(blas_handle, save_stream); } template void BackwardPass::IterateInternal( const T* R_t, // [H*3,H] const T* h, // [N,H] const T* v, // [N,H*4] const T* dh_new, // [N,H] T* dbx, // [H*3] T* dbr, // [H*3] T* dh, // [N,H] T* dp, // [N,H*3] T* dq, // [N,H*3] const T* zoneout_mask) { // [N,H] const T alpha = static_cast(1.0); const T beta_sum = static_cast(1.0); const int batch_size = data_->batch_size; const int hidden_size = data_->hidden_size; const cublasHandle_t blas_handle = data_->blas_handle; const cudaStream_t stream1 = data_->stream[0]; const cudaEvent_t event = data_->event; // Compute launch configuration for pointwise operations kernel. 
const dim3 blockDim(32, 16); const dim3 gridDim( (hidden_size + blockDim.x - 1) / blockDim.x, (batch_size + blockDim.y - 1) / blockDim.y); if (zoneout_mask) { PointwiseOperations<<>>( batch_size, hidden_size, h, v, dh_new, dbx, dbr, dh, dp, dq, zoneout_mask ); } else { PointwiseOperations<<>>( batch_size, hidden_size, h, v, dh_new, dbx, dbr, dh, dp, dq, nullptr ); } cudaEventRecord(event, stream1); cublasSetStream(blas_handle, stream1); blas::gemm(blas_handle, CUBLAS_OP_N, CUBLAS_OP_N, hidden_size, batch_size, hidden_size * 3, &alpha, R_t, hidden_size, dq, hidden_size * 3, &beta_sum, dh, hidden_size); } template void BackwardPass::Run( const int steps, const T* W_t, const T* R_t, const T* bx, const T* br, const T* x_t, const T* h, const T* v, const T* dh_new, T* dx, T* dW, T* dR, T* dbx, T* dbr, T* dh, T* dp, T* dq, const T* zoneout_mask) { const blas::enable_tensor_cores scoped0(data_->blas_handle); const blas::set_pointer_mode scoped1(data_->blas_handle); const T alpha = static_cast(1.0); const T beta_sum = static_cast(1.0); const T beta_assign = static_cast(0.0); const int batch_size = data_->batch_size; const int input_size = data_->input_size; const int hidden_size = data_->hidden_size; const cublasHandle_t blas_handle = data_->blas_handle; const cudaStream_t stream1 = data_->stream[0]; const cudaStream_t stream2 = data_->stream[1]; const cudaEvent_t event = data_->event; cudaStream_t save_stream; cublasGetStream(blas_handle, &save_stream); const int NH = batch_size * hidden_size; for (int i = steps - 1; i >= 0; --i) { IterateInternal( R_t, h + i * NH, v + i * NH * 4, dh_new + (i + 1) * NH, dbx, dbr, dh, dp + i * NH * 3, dq + i * NH * 3, zoneout_mask ? zoneout_mask + i * NH : nullptr ); } // Wait for pointwise operations to complete since there's a // data dependency between its output (`dp`, `dq`) and the following matmuls. 
cudaStreamWaitEvent(stream2, event, 0); cublasSetStream(blas_handle, stream2); blas::gemm(blas_handle, CUBLAS_OP_N, CUBLAS_OP_N, input_size, batch_size * steps, hidden_size * 3, &alpha, W_t, input_size, dp, hidden_size * 3, &beta_assign, dx, input_size); cublasSetStream(blas_handle, stream2); blas::gemm(blas_handle, CUBLAS_OP_N, CUBLAS_OP_T, hidden_size * 3, hidden_size, batch_size * steps, &alpha, dq, hidden_size * 3, h, hidden_size, &beta_sum, dR, hidden_size * 3); cublasSetStream(blas_handle, stream1); blas::gemm(blas_handle, CUBLAS_OP_N, CUBLAS_OP_N, hidden_size * 3, input_size, batch_size * steps, &alpha, dp, hidden_size * 3, x_t, batch_size * steps, &beta_sum, dW, hidden_size * 3); cublasSetStream(blas_handle, save_stream); } template struct BackwardPass; template struct BackwardPass; template struct BackwardPass; } // namespace gru } // namespace v0 } // namespace haste ================================================ FILE: lib/gru_forward_gpu.cu.cc ================================================ // Copyright 2020 LMNT, Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
// ============================================================================== #include #include #include #include "blas.h" #include "device_assert.h" #include "haste.h" #include "inline_ops.h" namespace { template __global__ void PointwiseOperations(const int batch_dim, const int hidden_dim, const T* Wx, const T* Rh, const T* bx, const T* br, const T* h, T* h_out, T* v, const T zoneout_prob, const T* zoneout_mask) { // Zoneout mask (only used if ApplyZoneout==true) const int row = blockDim.x * blockIdx.x + threadIdx.x; const int col = blockDim.y * blockIdx.y + threadIdx.y; if (row >= hidden_dim || col >= batch_dim) return; const int weight_idx = col * (hidden_dim * 3) + row; // Index into the `h` and `h_out` vectors (they have a stride of `hidden_dim`). const int output_idx = col * hidden_dim + row; // Indicies into the Wx and Rh matrices (for each of the u, r, and e components). const int z_idx = weight_idx + 0 * hidden_dim; const int r_idx = weight_idx + 1 * hidden_dim; const int g_idx = weight_idx + 2 * hidden_dim; // Indices into the bias vectors (for each of the u, r, and e components). const int bz_idx = row + 0 * hidden_dim; const int br_idx = row + 1 * hidden_dim; const int bg_idx = row + 2 * hidden_dim; const T z = sigmoid(Wx[z_idx] + Rh[z_idx] + bx[bz_idx] + br[bz_idx]); const T r = sigmoid(Wx[r_idx] + Rh[r_idx] + bx[br_idx] + br[br_idx]); const T g = tanh (Wx[g_idx] + r * (Rh[g_idx] + br[bg_idx]) + bx[bg_idx]); // Store internal activations if we're eventually going to backprop. 
if (Training) { const int base_v_idx = col * (hidden_dim * 4) + row; v[base_v_idx + 0 * hidden_dim] = z; v[base_v_idx + 1 * hidden_dim] = r; v[base_v_idx + 2 * hidden_dim] = g; v[base_v_idx + 3 * hidden_dim] = Rh[g_idx] + br[bg_idx]; } T cur_h_value = z * h[output_idx] + (static_cast(1.0) - z) * g; if (ApplyZoneout) { if (Training) { cur_h_value = (cur_h_value - h[output_idx]) * zoneout_mask[output_idx] + h[output_idx]; } else { cur_h_value = (zoneout_prob * h[output_idx]) + ((static_cast(1.0) - zoneout_prob) * cur_h_value); } } h_out[output_idx] = cur_h_value; } #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700) template __global__ void PointwiseOperations(const int batch_dim, const int hidden_dim, const half* Wx, const half* Rh, const half* bx, const half* br, const half* h, half* h_out, half* v, const half zoneout_prob, const half* zoneout_mask) { device_assert_fail("FP16 is not supported on compute capability < 7.0."); } #endif } // anonymous namespace namespace haste { namespace v0 { namespace gru { template struct ForwardPass::private_data { bool training; int batch_size; int input_size; int hidden_size; cublasHandle_t blas_handle; cudaStream_t stream[2]; cudaEvent_t event; cudaStream_t sync_stream; }; template ForwardPass::ForwardPass( const bool training, const int batch_size, const int input_size, const int hidden_size, const cublasHandle_t& blas_handle, const cudaStream_t& stream) : data_(new private_data) { data_->training = training; data_->batch_size = batch_size; data_->input_size = input_size; data_->hidden_size = hidden_size; data_->blas_handle = blas_handle; data_->sync_stream = stream; cudaStreamCreate(&data_->stream[0]); cudaStreamCreate(&data_->stream[1]); cudaEventCreateWithFlags(&data_->event, cudaEventDisableTiming); } template ForwardPass::~ForwardPass() { if (data_->sync_stream) { cudaEventRecord(data_->event, data_->stream[1]); cudaStreamWaitEvent(data_->sync_stream, data_->event, 0); cudaEventRecord(data_->event, data_->stream[0]); 
cudaStreamWaitEvent(data_->sync_stream, data_->event, 0); } else { cudaStreamSynchronize(data_->stream[1]); cudaStreamSynchronize(data_->stream[0]); } cudaEventDestroy(data_->event); cudaStreamDestroy(data_->stream[1]); cudaStreamDestroy(data_->stream[0]); delete data_; } template void ForwardPass::Iterate( const T* W, // [C,H*3] const T* R, // [H,H*3] const T* bx, // [H*3] const T* br, // [H*3] const T* x, // [N,C] const T* h, // [N,H] T* h_out, // [N,H] T* v, // [N,H*4] T* tmp_Wx, // [N,H*3] T* tmp_Rh, // [N,H*3] const float zoneout_prob, const T* zoneout_mask) { // Zoneout mask [N,H] static const T alpha = static_cast(1.0); static const T beta = static_cast(0.0); const blas::set_pointer_mode scoped1(data_->blas_handle); const int batch_size = data_->batch_size; const int input_size = data_->input_size; const int hidden_size = data_->hidden_size; const cublasHandle_t blas_handle = data_->blas_handle; const cudaStream_t stream2 = data_->stream[1]; const cudaEvent_t event = data_->event; cudaStream_t save_stream; cublasGetStream(blas_handle, &save_stream); cublasSetStream(blas_handle, stream2); blas::gemm(blas_handle, CUBLAS_OP_N, CUBLAS_OP_N, hidden_size * 3, batch_size, input_size, &alpha, W, hidden_size * 3, x, input_size, &beta, tmp_Wx, hidden_size * 3); cudaEventRecord(event, stream2); IterateInternal( R, bx, br, h, h_out, v, tmp_Wx, tmp_Rh, zoneout_prob, zoneout_mask); cublasSetStream(blas_handle, save_stream); } template void ForwardPass::IterateInternal( const T* R, // [H,H*3] const T* bx, // [H*3] const T* br, // [H*3] const T* h, // [N,H] T* h_out, // [N,H] T* v, // [N,H*4] T* tmp_Wx, // [N,H*3] T* tmp_Rh, // [N,H*3] const float zoneout_prob, const T* zoneout_mask) { // Zoneout mask [N,H] // Constants for GEMM static const T alpha = static_cast(1.0); static const T beta = static_cast(0.0); const bool training = data_->training; const int batch_size = data_->batch_size; const int hidden_size = data_->hidden_size; const cublasHandle_t blas_handle = 
data_->blas_handle; const cudaStream_t stream1 = data_->stream[0]; const cudaEvent_t event = data_->event; cublasSetStream(blas_handle, stream1); blas::gemm(blas_handle, CUBLAS_OP_N, CUBLAS_OP_N, hidden_size * 3, batch_size, hidden_size, &alpha, R, hidden_size * 3, h, hidden_size, &beta, tmp_Rh, hidden_size * 3); // Compute launch configuration for pointwise operations kernel. const dim3 blockDim(32, 16); const dim3 gridDim( (hidden_size + blockDim.x - 1) / blockDim.x, (batch_size + blockDim.y - 1) / blockDim.y); cudaStreamWaitEvent(stream1, event, 0); if (training) { if (zoneout_prob && zoneout_mask) { PointwiseOperations<<>>( batch_size, hidden_size, tmp_Wx, tmp_Rh, bx, br, h, h_out, v, zoneout_prob, zoneout_mask); } else { PointwiseOperations<<>>( batch_size, hidden_size, tmp_Wx, tmp_Rh, bx, br, h, h_out, v, 0.0f, nullptr); } } else { if (zoneout_prob && zoneout_mask) { PointwiseOperations<<>>( batch_size, hidden_size, tmp_Wx, tmp_Rh, bx, br, h, h_out, nullptr, zoneout_prob, zoneout_mask); } else { PointwiseOperations<<>>( batch_size, hidden_size, tmp_Wx, tmp_Rh, bx, br, h, h_out, nullptr, 0.0f, nullptr); } } } template void ForwardPass::Run( const int steps, const T* W, // [C,H*3] const T* R, // [H,H*3] const T* bx, // [H*3] const T* br, // [H*3] const T* x, // [N,C] T* h, // [N,H] T* v, // [N,H*4] T* tmp_Wx, // [N,H*3] T* tmp_Rh, // [N,H*3] const float zoneout_prob, const T* zoneout_mask) { // Zoneout mask [N,H] static const T alpha = static_cast(1.0); static const T beta = static_cast(0.0); const blas::enable_tensor_cores scoped0(data_->blas_handle); const blas::set_pointer_mode scoped1(data_->blas_handle); const int batch_size = data_->batch_size; const int input_size = data_->input_size; const int hidden_size = data_->hidden_size; const cublasHandle_t blas_handle = data_->blas_handle; const cudaStream_t stream2 = data_->stream[1]; const cudaEvent_t event = data_->event; cudaStream_t save_stream; cublasGetStream(blas_handle, &save_stream); 
cublasSetStream(blas_handle, stream2); blas::gemm(blas_handle, CUBLAS_OP_N, CUBLAS_OP_N, hidden_size * 3, steps * batch_size, input_size, &alpha, W, hidden_size * 3, x, input_size, &beta, tmp_Wx, hidden_size * 3); cudaEventRecord(event, stream2); const int NH = batch_size * hidden_size; for (int i = 0; i < steps; ++i) { IterateInternal( R, bx, br, h + i * NH, h + (i + 1) * NH, v + i * NH * 4, tmp_Wx + i * NH * 3, tmp_Rh, zoneout_prob, zoneout_mask ? zoneout_mask + i * NH : nullptr); } cublasSetStream(blas_handle, save_stream); } template struct ForwardPass; template struct ForwardPass; template struct ForwardPass; } // namespace gru } // namespace v0 } // namespace haste ================================================ FILE: lib/haste/gru.h ================================================ // Copyright 2020 LMNT, Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // ============================================================================== #pragma once #include #include namespace haste { namespace v0 { namespace gru { template class ForwardPass { public: // training: `true` if the caller intends to perform a backward pass to compute gradients. // batch_size: the number of training/inference inputs provided in each tensor. // input_size: the dimension of each input vector. // hidden_size: the expected dimension of each output vector. // blas_handle: an initialized cuBLAS handle (see `cublasCreate`). 
ForwardPass( const bool training, const int batch_size, const int input_size, const int hidden_size, const cublasHandle_t& blas_handle, const cudaStream_t& stream = 0); // Releases internal resources. // Blocks until all iterations have completed executing on the GPU. ~ForwardPass(); // Performs one forward iteration of the GRU cell. // // W: [C,H*3] the input weight matrix. // R: [H,H*3] the recurrent weight matrix. // bx: [H*3] the bias for the input weight matrix. // br: [H*3] the bias for the recurrent weight matrix. // x: [N,C] the GRU input for this iteration (N vectors, each with dimension C). // h: [N,H] the t-1 iteration's `h_out` or the initial hidden state if this is the // t=0 iteration (typically zeros). // h_out: [N,H] the GRU's output, and the input to the next iteration's `h`. This // pointer may be the same as `h`. Each iteration may reuse the same memory region. // v: [N,H*4] if `training` is `false`, this can be a null pointer. If `training` is // `true`, this vector will contain intermediate activations for this iteration which // must be provided as-is to the corresponding backward iteration. The caller must // provide a new memory region for each iteration. // tmp_Wx: [N,H*3] additional temporary work space required for this iteration. The caller // should not use the contents of this vector, and must provide a new memory region for // each iteration. // tmp_Rh: [N,H*3] additional temporary work space required for this iteration. The caller // should not use the contents of this vector. The same memory region may be provided // for each iteration. // zoneout_prob: 0.0 <= zoneout_prob <= 1.0; specifies the probability of a hidden // activation being randomly zoned out. If zoneout was used during training, this // parameter must also be specified during inference with the same value. // zoneout_mask: [N,H] may be null to disable zoneout. This is a random binary mask // following a Bernoulli(1-zoneout_prob) distribution. 
A different mask is typically // used for each iteration. void Iterate( const T* W, const T* R, const T* bx, const T* br, const T* x, const T* h, T* h_out, T* v, T* tmp_Wx, T* tmp_Rh, const float zoneout_prob, const T* zoneout_mask); void Run( const int steps, const T* W, const T* R, const T* bx, const T* br, const T* x, T* h, T* v, T* tmp_Wx, T* tmp_Rh, const float zoneout_prob, const T* zoneout_mask); private: void IterateInternal( const T* R, const T* bx, const T* br, const T* h, T* h_out, T* v, T* tmp_Wx, T* tmp_Rh, const float zoneout_prob, const T* zoneout_mask); struct private_data; private_data* data_; }; template class BackwardPass { public: // batch_size: the number of training inputs provided in each tensor. // input_size: the dimension of each input vector. // hidden_size: the expected dimension of each output vector. // blas_handle: an initialized cuBLAS handle (see `cublasCreate`). BackwardPass( const int batch_size, const int input_size, const int hidden_size, const cublasHandle_t& blas_handle, const cudaStream_t& stream = 0); // Releases internal resources. // Blocks until all iterations have completed executing on the GPU. ~BackwardPass(); // Performs one backward iteration of the GRU cell. // // Note that BackwardPass must be iterated in the reverse order as ForwardPass. // If ForwardPass iterates from 0 to T-1, BackwardPass needs to iterate from // T-1 down to 0. When iteration numbers are described, they will be based on the // iteration index (i.e., the T-1'th iteration of the forward pass is the last call // to ForwardPass::Iterate, whereas it is the first call to BackwardPass::Iterate). // // W_t: [H*3,C] the transpose of the input weight matrix. // R_t: [H*3,H] the transpose of the recurrent weight matrix. // bx: [H*3] the bias vector for the input weight matrix. // br: [H*3] the bias vector for the recurrent weight matrix. // x_t: [C,N] the transpose of the GRU input for this iteration. 
// h: [N,H] the t-1 iteration's `h_out` or the initial hidden state if this is the t=0 // iteration (typically zeros). // v: [N,H*4] the same vector as returned by ForwardPass::Iterate on its corresponding // iteration. // dh_new: [N,H] the gradient of `h_out` with respect to the loss at this iteration. // dx: [N,C] the gradient of the input at this time step with respect to the loss. // dW: [C,H*3] the gradient of the input weight matrix with respect to the loss. // dR: [H,H*3] the gradient of the recurrent weight matrix with respect to the loss. // dbx: [H*3] the gradient of the bias vector for the input weight matrix with respect to // the loss. // dbr: [H*3] the gradient of the bias vector for the recurrent weight matrix with respect // to the loss. // dh: [N,H] NOTE: this is an input and output parameter. Should be initialized to zeros // for the T-1'th iteration and the same pointer should be passed in for each // iteration. After a complete backward pass, this vector will contain the gradient // of the initial hidden state with respect to the loss. // dp: [N,H*3] additional temporary work space required for this iteration. The caller // should not use the contents of this vector. A new memory region must be provided // for each iteration. // dq: [N,H*3] additional temporary work space required for this iteration. The caller // should not use the contents of this vector. A new memory region must be provided // for each iteration. // zoneout_mask: [N,H] may be null if zoneout was disabled in the forward pass. This vector // must be the same as the one provided during the corresponding forward iteration. 
void Iterate( const T* W_t, const T* R_t, const T* bx, const T* br, const T* x_t, const T* h, const T* v, const T* dh_new, T* dx, T* dW, T* dR, T* dbx, T* dbr, T* dh, T* dp, T* dq, const T* zoneout_mask); void Run( const int steps, const T* W_t, const T* R_t, const T* bx, const T* br, const T* x_t, const T* h, const T* v, const T* dh_new, T* dx, T* dW, T* dR, T* dbx, T* dbr, T* dh, T* dp, T* dq, const T* zoneout_mask); private: void IterateInternal( const T* R_t, const T* h, const T* v, const T* dh_new, T* dbx, T* dbr, T* dh, T* dp, T* dq, const T* zoneout_mask); struct private_data; private_data* data_; }; } // namespace gru } // namespace v0 } // namespace haste ================================================ FILE: lib/haste/indrnn.h ================================================ // Copyright 2020 LMNT, Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
// ============================================================================== #pragma once #include #include namespace haste { namespace v0 { namespace indrnn { template class ForwardPass { public: ForwardPass( const bool training, const int batch_size, const int input_size, const int hidden_size, const cublasHandle_t& blas_handle, const cudaStream_t& stream = 0); ~ForwardPass(); void Run( const int steps, const T* W, const T* u, const T* b, const T* x, T* h, T* workspace, const float zoneout_prob, const T* zoneout_mask); private: struct private_data; private_data* data_; }; template class BackwardPass { public: BackwardPass( const int batch_size, const int input_size, const int hidden_size, const cublasHandle_t& blas_handle, const cudaStream_t& stream = 0); ~BackwardPass(); void Run( const int steps, const T* W_t, const T* u, const T* b, const T* x_t, const T* h, const T* dh_new, T* dx, T* dW, T* du, T* db, T* dh, T* workspace, const T* zoneout_mask); private: struct private_data; private_data* data_; }; } // namespace indrnn } // namespace v0 } // namespace haste ================================================ FILE: lib/haste/layer_norm.h ================================================ // Copyright 2020 LMNT, Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
// ============================================================================== #pragma once #include namespace haste { namespace v0 { namespace layer_norm { template class ForwardPass { public: // gamma: [H] // beta: [H] // cache: [N,2] ForwardPass( const int batch_size, const int hidden_size, const T* gamma, const T* beta, T* cache); // Computes the layer norm of an input tensor `x` over its innermost (fastest changing) // dimension. The layer norm is defined as: \(\frac{x-\mu}{\sigma} \gamma + \beta\) // where `\gamma` and `\beta` are trainable parameters. // // x: [N,H] // y: [N,H] void Run(const cudaStream_t& stream, const T* x, T* y); void RunPartial( const cudaStream_t& stream, const int minibatch, const T* x, T* y); private: const int batch_size_; const int hidden_size_; const T* gamma_; const T* beta_; T* cache_; int partial_; }; template class BackwardPass { public: BackwardPass( const int batch_size, const int hidden_size, const T* gamma, const T* beta, const T* x, T* dgamma, T* dbeta, T* cache); void Run(const cudaStream_t& stream, const T* dy, T* dx); void RunPartial( const cudaStream_t& stream, const int minibatch, const T* dy, T* dx); private: const int batch_size_; const int hidden_size_; const T* gamma_; const T* beta_; const T* x_; T* dgamma_; T* dbeta_; T* cache_; int partial_; }; } // namespace layer_norm } // namespace v0 } // namespace haste ================================================ FILE: lib/haste/layer_norm_gru.h ================================================ // Copyright 2020 LMNT, Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // ============================================================================== #pragma once #include #include namespace haste { namespace v0 { namespace layer_norm_gru { template class ForwardPass { public: // training: `true` if the caller intends to perform a backward pass to compute gradients. // batch_size: the number of training/inference inputs provided in each tensor. // input_size: the dimension of each input vector. // hidden_size: the expected dimension of each output vector. // blas_handle: an initialized cuBLAS handle (see `cublasCreate`). ForwardPass( const bool training, const int batch_size, const int input_size, const int hidden_size, const cublasHandle_t& blas_handle, const cudaStream_t& stream = 0); // Releases internal resources. // Blocks until all iterations have completed executing on the GPU. ~ForwardPass(); // Performs one forward iteration of the GRU cell. // // W: [C,H*3] the input weight matrix. // R: [H,H*3] the recurrent weight matrix. // bx: [H*3] the bias for the input weight matrix. // br: [H*3] the bias for the recurrent weight matrix. // x: [N,C] the GRU input for this iteration (N vectors, each with dimension C). // h: [N,H] the t-1 iteration's `h_out` or the initial hidden state if this is the // t=0 iteration (typically zeros). // h_out: [N,H] the GRU's output, and the input to the next iteration's `h`. This // pointer may be the same as `h`. Each iteration may reuse the same memory region. // v: [N,H*4] if `training` is `false`, this can be a null pointer. 
If `training` is // `true`, this vector will contain intermediate activations for this iteration which // must be provided as-is to the corresponding backward iteration. The caller must // provide a new memory region for each iteration. // tmp_Wx: [N,H*3] additional temporary work space required for this iteration. The caller // should not use the contents of this vector, and must provide a new memory region for // each iteration. // tmp_Rh: [N,H*3] additional temporary work space required for this iteration. The caller // should not use the contents of this vector. The same memory region may be provided // for each iteration. // zoneout_prob: 0.0 <= zoneout_prob <= 1.0; specifies the probability of a hidden // activation being randomly zoned out. If zoneout was used during training, this // parameter must also be specified during inference with the same value. // zoneout_mask: [N,H] may be null to disable zoneout. This is a random binary mask // following a Bernoulli(1-zoneout_prob) distribution. A different mask is typically // used for each iteration. void Run( const int steps, const T* W, const T* R, const T* bx, const T* br, const T* x, T* h, T* v, T* act_Wx, layer_norm::ForwardPass& layer_norm1, T* tmp_Wx_norm, T* act_Rh, layer_norm::ForwardPass& layer_norm2, T* tmp_Rh_norm, const float zoneout_prob, const T* zoneout_mask); private: void IterateInternal( const T* R, const T* bx, const T* br, const T* h, T* h_out, T* v, T* tmp_Wx_norm, T* act_Rh, layer_norm::ForwardPass& layer_norm2, T* tmp_Rh_norm, const float zoneout_prob, const T* zoneout_mask); struct private_data; private_data* data_; }; template class BackwardPass { public: // batch_size: the number of training inputs provided in each tensor. // input_size: the dimension of each input vector. // hidden_size: the expected dimension of each output vector. // blas_handle: an initialized cuBLAS handle (see `cublasCreate`). 
BackwardPass( const int batch_size, const int input_size, const int hidden_size, const cublasHandle_t& blas_handle, const cudaStream_t& stream = 0); // Releases internal resources. // Blocks until all iterations have completed executing on the GPU. ~BackwardPass(); // Performs one backward iteration of the GRU cell. // // Note that BackwardPass must be iterated in the reverse order as ForwardPass. // If ForwardPass iterates from 0 to T-1, BackwardPass needs to iterate from // T-1 down to 0. When iteration numbers are described, they will be based on the // iteration index (i.e., the T-1'th iteration of the forward pass is the last call // to ForwardPass::Iterate, whereas it is the first call to BackwardPass::Iterate). // // W_t: [H*3,C] the transpose of the input weight matrix. // R_t: [H*3,H] the transpose of the recurrent weight matrix. // bx: [H*3] the bias vector for the input weight matrix. // br: [H*3] the bias vector for the recurrent weight matrix. // x_t: [C,N] the transpose of the GRU input for this iteration. // h: [N,H] the t-1 iteration's `h_out` or the initial hidden state if this is the t=0 // iteration (typically zeros). // v: [N,H*4] the same vector as returned by ForwardPass::Iterate on its corresponding // iteration. // dh_new: [N,H] the gradient of `h_out` with respect to the loss at this iteration. // dx: [N,C] the gradient of the input at this time step with respect to the loss. // dW: [C,H*3] the gradient of the input weight matrix with respect to the loss. // dR: [H,H*3] the gradient of the recurrent weight matrix with respect to the loss. // dbx: [H*3] the gradient of the bias vector for the input weight matrix with respect to // the loss. // dbr: [H*3] the gradient of the bias vector for the recurrent weight matrix with respect // to the loss. // dh: [N,H] NOTE: this is an input and output parameter. Should be initialized to zeros // for the T-1'th iteration and the same pointer should be passed in for each // iteration. 
After a complete backward pass, this vector will contain the gradient // of the initial hidden state with respect to the loss. // dp: [N,H*3] additional temporary work space required for this iteration. The caller // should not use the contents of this vector. A new memory region must be provided // for each iteration. // dq: [N,H*3] additional temporary work space required for this iteration. The caller // should not use the contents of this vector. A new memory region must be provided // for each iteration. // zoneout_mask: [N,H] may be null if zoneout was disabled in the forward pass. This vector // must be the same as the one provided during the corresponding forward iteration. void Run( const int steps, const T* W_t, const T* R_t, const T* bx, const T* br, const T* x_t, const T* h, const T* v, const T* dh_new, T* dx, T* dW, T* dR, T* dbx, T* dbr, T* dh, T* dp, T* dq, layer_norm::BackwardPass& layer_norm1, layer_norm::BackwardPass& layer_norm2, const T* zoneout_mask); private: void IterateInternal( const T* R_t, const T* h, const T* v, const T* dh_new, T* dbx, T* dbr, T* dh, T* dp, T* dq, layer_norm::BackwardPass& layer_norm2, const T* zoneout_mask); struct private_data; private_data* data_; }; } // namespace layer_norm_gru } // namespace v0 } // namespace haste ================================================ FILE: lib/haste/layer_norm_indrnn.h ================================================ // Copyright 2020 LMNT, Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. // ============================================================================== #pragma once #include #include namespace haste { namespace v0 { namespace layer_norm_indrnn { template class ForwardPass { public: ForwardPass( const bool training, const int batch_size, const int input_size, const int hidden_size, const cublasHandle_t& blas_handle, const cudaStream_t& stream = 0); ~ForwardPass(); void Run( const int steps, const T* W, const T* u, const T* b, const T* x, T* h, T* workspace, T* act_Wx, layer_norm::ForwardPass& layer_norm1, const float zoneout_prob, const T* zoneout_mask); private: struct private_data; private_data* data_; }; template class BackwardPass { public: BackwardPass( const int batch_size, const int input_size, const int hidden_size, const cublasHandle_t& blas_handle, const cudaStream_t& stream = 0); ~BackwardPass(); void Run( const int steps, const T* W_t, const T* u, const T* b, const T* x_t, const T* h, const T* dh_new, T* dx, T* dW, T* du, T* db, T* dh, T* workspace, layer_norm::BackwardPass& layer_norm1, const T* zoneout_mask); private: struct private_data; private_data* data_; }; } // namespace layer_norm_indrnn } // namespace v0 } // namespace haste ================================================ FILE: lib/haste/layer_norm_lstm.h ================================================ // Copyright 2020 LMNT, Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. // ============================================================================== #pragma once #include #include namespace haste { namespace v0 { namespace layer_norm_lstm { template class ForwardPass { public: // training: `true` if the caller intends to perform a backward pass to compute gradients. // batch_size: the number of training/inference inputs provided in each tensor. // input_size: the dimension of each input vector. // hidden_size: the expected dimension of each output vector. // blas_handle: an initialized cuBLAS handle (see `cublasCreate`). ForwardPass( const bool training, const int batch_size, const int input_size, const int hidden_size, const cublasHandle_t& blas_handle, const cudaStream_t& stream = 0); // Releases internal resources. // Blocks until all iterations have completed executing on the GPU. ~ForwardPass(); // Runs the LSTM over all time steps. This method is faster than using a per-step // `Iterate` but requires that the entire input sequence be available upfront. In some // situations, this constraint may not be satisfiable (e.g. autoregressive models). // Users should prefer calling `Run` over `Iterate` whenever possible. // // steps: the number of iterations to run (i.e. T). // W: [C,H*4] the input weight matrix. // R: [H,H*4] the recurrent weight matrix. // b: [H*4] the bias vector. // x: [T,N,C] the LSTM input for this iteration (N vectors, each with dimension C). // h: [T+1,N,H] the hidden state vectors across all time steps. The t=0'th vector should // be set to the desired initial hidden state (typically zeros). The rest of the // vectors will be set by this function. `h[1:,:,:]` forms the output of this LSTM // layer. // c: [T+1,N,H] the cell state vectors across all time steps. The t=0'th vector should be // set to the desired initial cell state (typically zeros). The rest of the vectors // will be set by this function. 
// v: [T,N,H*4] if `training` is `false`, this is scratch space and should not be used by // the caller. If `training` is `true`, this parameter will contain intermediate // activations which must be provided as-is to `BackwardPass::Run` or manually urolled // for `BackwardPass::Iterate`. // tmp_Rh: [N,H*4] additional temporary work space required for this iteration. The caller // should not use the contents of this vector. The same memory region may be provided // for each iteration. // zoneout_prob: 0.0 <= zoneout_prob <= 1.0; specifies the probability of a hidden // activation being randomly zoned out. If zoneout was used during training, this // parameter must also be specified during inference with the same value. // zoneout_mask: [T,N,H] may be null to disable zoneout. This is a random binary mask // following a Bernoulli(1-zoneout_prob) distribution. A different mask is typically // used for each iteration. void Run( const int steps, const T* W, const T* R, const T* b, const T* x, T* h, T* c, T* act_Wx, T* tmp_Rh, layer_norm::ForwardPass& layer_norm1, T* act_Wx_norm, T* act_Rh, layer_norm::ForwardPass& layer_norm2, layer_norm::ForwardPass& layer_norm3, T* act_c_norm, const float zoneout_prob, const T* zoneout_mask); private: void IterateInternal( const T* R, const T* b, const T* h, const T* c, T* h_out, T* c_out, T* v, T* tmp_Rh, T* act_Rh, layer_norm::ForwardPass& layer_norm2, layer_norm::ForwardPass& layer_norm3, T* act_c_norm, const float zoneout_prob, const T* zoneout_mask); struct private_data; private_data* data_; }; template class BackwardPass { public: // batch_size: the number of training inputs provided in each tensor. // input_size: the dimension of each input vector. // hidden_size: the expected dimension of each output vector. // blas_handle: an initialized cuBLAS handle (see `cublasCreate`). 
BackwardPass( const int batch_size, const int input_size, const int hidden_size, const cublasHandle_t& blas_handle, const cudaStream_t& stream = 0); // Releases internal resources. // Blocks until all iterations have completed executing on the GPU. ~BackwardPass(); // Runs the LSTM backward pass over all time steps. This method is faster than using a // per-step `Iterate` but requires that the entire input sequence be available upfront. // In some situations, this constraint may not be satisfiable (e.g. autoregressive models). // Users should prefer calling `Run` over `Iterate` whenever possible. // // steps: the number of iterations to run (i.e. T). // W_t: [H*4,C] the transpose of the input weight matrix. // R_t: [H*4,H] the transpose of the recurrent weight matrix. // b: [H*4] the bias vector. // x_t: [C,T,N] the transpose of the LSTM input for this iteration. // h: [T+1,N,H] the hidden state vectors after running `ForwardPass::Run`. // c: [T+1,N,H] the cell state vectors after running `ForwardPass::Run`. // dh_new: [T+1,N,H] the gradient of the loss with respect to `h`. // dc_new: [T+1,N,H] the gradient of the loss with respect to `c`. // dx: [T,N,C] the gradient of the loss with respect to the input. // dW: [C,H*4] the gradient of the loss with respect to the input weight matrix. // dR: [H,H*4] the gradient of the loss with respect to the recurrent weight matrix. // db: [H*4] the gradient of the loss with respect to the bias vector. // dh: [N,H] NOTE: this is an input and output parameter. Should be initialized to zeros. // When this function returns, `dh` will contain the gradient of the loss with respect // to the initial hidden state. // dc: [N,H] NOTE: this is an input and output parameter. Should be initialized to zeros. // When this function returns, `dc` will contain the gradient of the loss with respect // to the initial cell state. // v: [T,N,H*4] the same tensor that was passed to `ForwardPass::Run`. 
// zoneout_mask: [T,N,H] may be null if zoneout was disabled in the forward pass. This // vector must be the same as the one provided during the forward pass. void Run( const int steps, const T* W_t, const T* R_t, const T* b, const T* x_t, const T* h, const T* c, const T* dh_new, const T* dc_new, T* dx, T* dW, T* dR, T* db, T* dh, T* dc, T* act_Wx, layer_norm::BackwardPass& layer_norm1, T* act_Wx_norm, T* act_Rh, layer_norm::BackwardPass& layer_norm2, layer_norm::BackwardPass& layer_norm3, T* act_c_norm, const T* zoneout_mask); private: void IterateInternal( const T* R_t, const T* c, const T* c_new, const T* dh_new, const T* dc_new, T* db, T* dh, T* dc, T* v, T* act_Rh, layer_norm::BackwardPass& layer_norm2, layer_norm::BackwardPass& layer_norm3, T* act_c_norm, const T* zoneout_mask); struct private_data; private_data* data_; }; } // namespace layer_norm_lstm } // namespace v0 } // namespace haste ================================================ FILE: lib/haste/lstm.h ================================================ // Copyright 2020 LMNT, Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // ============================================================================== #pragma once #include #include namespace haste { namespace v0 { namespace lstm { template class ForwardPass { public: // training: `true` if the caller intends to perform a backward pass to compute gradients. 
// batch_size: the number of training/inference inputs provided in each tensor. // input_size: the dimension of each input vector. // hidden_size: the expected dimension of each output vector. // blas_handle: an initialized cuBLAS handle (see `cublasCreate`). ForwardPass( const bool training, const int batch_size, const int input_size, const int hidden_size, const cublasHandle_t& blas_handle, const cudaStream_t& stream = 0); // Releases internal resources. // Blocks until all iterations have completed executing on the GPU. ~ForwardPass(); // Performs one forward iteration of the LSTM cell. // // W: [C,H*4] the input weight matrix. // R: [H,H*4] the recurrent weight matrix. // b: [H*4] the bias vector. // x: [N,C] the LSTM input for this iteration (N vectors, each with dimension C). // h: [N,H] the t-1 iteration's `h_out` or the initial hidden state if this is the // t=0 iteration (typically zeros). // c: [N,H] the t-1 iteration's `c_out` or the initial cell state if this is the // t=0 iteration (typically zeros). // h_out: [N,H] the LSTM's output, and the input to the next iteration's `h`. This // pointer may be the same as `h`. Each iteration may reuse the same memory region. // c_out: [N,H] the LSTM's internal cell state after this iteration is complete. This // will become the input to the next iteration's `c`. // v: [N,H*4] if `training` is `false`, this is scratch space and should not be used by // the caller. If `training` is `true`, this vector will contain intermediate // activations for this iteration which must be provided as-is to the corresponding // backward iteration. In either case, a new memory region must be provided for each // iteration. // tmp_Rh: [N,H*4] additional temporary work space required for this iteration. The caller // should not use the contents of this vector. The same memory region may be provided // for each iteration. 
// zoneout_prob: 0.0 <= zoneout_prob <= 1.0; specifies the probability of a hidden // activation being randomly zoned out. If zoneout was used during training, this // parameter must also be specified during inference with the same value. // zoneout_mask: [N,H] may be null to disable zoneout. This is a random binary mask // following a Bernoulli(1-zoneout_prob) distribution. A different mask is typically // used for each iteration. void Iterate( const cudaStream_t& stream, const T* W, const T* R, const T* b, const T* x, const T* h, const T* c, T* h_out, T* c_out, T* v, T* tmp_Rh, const float zoneout_prob, const T* zoneout_mask); // Runs the LSTM over all time steps. This method is faster than using a per-step // `Iterate` but requires that the entire input sequence be available upfront. In some // situations, this constraint may not be satisfiable (e.g. autoregressive models). // Users should prefer calling `Run` over `Iterate` whenever possible. // // steps: the number of iterations to run (i.e. T). // W: [C,H*4] the input weight matrix. // R: [H,H*4] the recurrent weight matrix. // b: [H*4] the bias vector. // x: [T,N,C] the LSTM input for this iteration (N vectors, each with dimension C). // h: [T+1,N,H] the hidden state vectors across all time steps. The t=0'th vector should // be set to the desired initial hidden state (typically zeros). The rest of the // vectors will be set by this function. `h[1:,:,:]` forms the output of this LSTM // layer. // c: [T+1,N,H] the cell state vectors across all time steps. The t=0'th vector should be // set to the desired initial cell state (typically zeros). The rest of the vectors // will be set by this function. // v: [T,N,H*4] if `training` is `false`, this is scratch space and should not be used by // the caller. If `training` is `true`, this parameter will contain intermediate // activations which must be provided as-is to `BackwardPass::Run` or manually urolled // for `BackwardPass::Iterate`. 
// tmp_Rh: [N,H*4] additional temporary work space required for this iteration. The caller // should not use the contents of this vector. The same memory region may be provided // for each iteration. // zoneout_prob: 0.0 <= zoneout_prob <= 1.0; specifies the probability of a hidden // activation being randomly zoned out. If zoneout was used during training, this // parameter must also be specified during inference with the same value. // zoneout_mask: [T,N,H] may be null to disable zoneout. This is a random binary mask // following a Bernoulli(1-zoneout_prob) distribution. A different mask is typically // used for each iteration. void Run( const int steps, const T* W, const T* R, const T* b, const T* x, T* h, T* c, T* v, T* tmp_Rh, const float zoneout_prob, const T* zoneout_mask); private: void IterateInternal( const T* R, const T* b, const T* h, const T* c, T* h_out, T* c_out, T* v, T* tmp_Rh, const float zoneout_prob, const T* zoneout_mask); struct private_data; private_data* data_; }; template class BackwardPass { public: // batch_size: the number of training inputs provided in each tensor. // input_size: the dimension of each input vector. // hidden_size: the expected dimension of each output vector. // blas_handle: an initialized cuBLAS handle (see `cublasCreate`). BackwardPass( const int batch_size, const int input_size, const int hidden_size, const cublasHandle_t& blas_handle, const cudaStream_t& stream = 0); // Releases internal resources. // Blocks until all iterations have completed executing on the GPU. ~BackwardPass(); // Performs one backward iteration of the LSTM cell. // // Note that BackwardPass must be iterated in the reverse order as ForwardPass. // If ForwardPass iterates from 0 to T-1, BackwardPass needs to iterate from // T-1 down to 0. 
When iteration numbers are described, they will be based on the // iteration index (i.e., the T-1'th iteration of the forward pass is the last call // to ForwardPass::Iterate, whereas it is the first call to BackwardPass::Iterate). // // W_t: [H*4,C] the transpose of the input weight matrix. // R_t: [H*4,H] the transpose of the recurrent weight matrix. // b: [H*4] the bias vector. // x_t: [C,N] the transpose of the LSTM input for this iteration. // h: [N,H] the hidden state of the t'th iteration or the initial hidden state if this is // the t=0 iteration (typically zeros). // c: [N,H] the t-1'th forward iteration's `c_out` or the initial cell state if this is // the t=0 iteration (typically zeros). // c_new: [N,H] the t'th forward iteration's `c_out` vector. // dh_new: [N,H] the gradient of the loss with respect to `h_out` at this iteration. // dc_new: [N,H] the gradient of the loss with respect to `c_out` at this iteration. // dx: [N,C] the gradient of the loss with respect to the input at this time step. // dW: [C,H*4] the gradient of the loss with respect to the input weight matrix. // dR: [H,H*4] the gradient of the loss with respect to the recurrent weight matrix. // db: [H*4] the gradient of the loss with respect to the bias vector. // dh: [N,H] NOTE: this is an input and output parameter. Should be initialized to zeros // for the T-1'th iteration and the same pointer should be passed in for each // iteration. After a complete backward pass, this vector will contain the gradient // of the loss with respect to the initial hidden state. // dc: [N,H] NOTE: this is an input and output parameter. Should be initialized to zeros // for the T-1'th iteration and the same pointer should be passed in for each // iteration. After a complete backward pass, this vector will contain the gradient // of the loss with respect to the initial cell state. // v: [N,H*4] the same tensor that was passed to `ForwardPass::Iterate` on its corresponding // iteration. 
// zoneout_mask: [N,H] may be null if zoneout was disabled in the forward pass. This vector // must be the same as the one provided during the corresponding forward iteration. void Iterate( const cudaStream_t& stream, const T* W_t, const T* R_t, const T* b, const T* x_t, const T* h, const T* c, const T* c_new, const T* dh_new, const T* dc_new, T* dx, T* dW, T* dR, T* db, T* dh, T* dc, T* v, const T* zoneout_mask); // Runs the LSTM backward pass over all time steps. This method is faster than using a // per-step `Iterate` but requires that the entire input sequence be available upfront. // In some situations, this constraint may not be satisfiable (e.g. autoregressive models). // Users should prefer calling `Run` over `Iterate` whenever possible. // // steps: the number of iterations to run (i.e. T). // W_t: [H*4,C] the transpose of the input weight matrix. // R_t: [H*4,H] the transpose of the recurrent weight matrix. // b: [H*4] the bias vector. // x_t: [C,T,N] the transpose of the LSTM input for this iteration. // h: [T+1,N,H] the hidden state vectors after running `ForwardPass::Run`. // c: [T+1,N,H] the cell state vectors after running `ForwardPass::Run`. // dh_new: [T+1,N,H] the gradient of the loss with respect to `h`. // dc_new: [T+1,N,H] the gradient of the loss with respect to `c`. // dx: [T,N,C] the gradient of the loss with respect to the input. // dW: [C,H*4] the gradient of the loss with respect to the input weight matrix. // dR: [H,H*4] the gradient of the loss with respect to the recurrent weight matrix. // db: [H*4] the gradient of the loss with respect to the bias vector. // dh: [N,H] NOTE: this is an input and output parameter. Should be initialized to zeros. // When this function returns, `dh` will contain the gradient of the loss with respect // to the initial hidden state. // dc: [N,H] NOTE: this is an input and output parameter. Should be initialized to zeros. 
// When this function returns, `dc` will contain the gradient of the loss with respect // to the initial cell state. // v: [T,N,H*4] the same tensor that was passed to `ForwardPass::Run`. // zoneout_mask: [T,N,H] may be null if zoneout was disabled in the forward pass. This // vector must be the same as the one provided during the forward pass. void Run( const int steps, const T* W_t, const T* R_t, const T* b, const T* x_t, const T* h, const T* c, const T* dh_new, const T* dc_new, T* dx, T* dW, T* dR, T* db, T* dh, T* dc, T* v, const T* zoneout_mask); private: void IterateInternal( const T* R_t, const T* c, const T* c_new, const T* dh_new, const T* dc_new, T* db, T* dh, T* dc, T* v, const T* zoneout_mask); struct private_data; private_data* data_; }; } // namespace lstm } // namespace v0 } // namespace haste ================================================ FILE: lib/haste.h ================================================ // Copyright 2020 LMNT, Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // ============================================================================== #pragma once // GENERAL NOTES: // No pointers may be null unless otherwise specified. // All pointers are expected to point to device memory. // The square brackets below describe tensor shapes, where // T = number of RNN time steps // N = batch size // C = input size // H = hidden size // and the rightmost dimension changes the fastest. 
#include "haste/gru.h"
#include "haste/indrnn.h"
#include "haste/layer_norm.h"
#include "haste/layer_norm_gru.h"
#include "haste/layer_norm_indrnn.h"
#include "haste/layer_norm_lstm.h"
#include "haste/lstm.h"

================================================ FILE: lib/indrnn_backward_gpu.cu.cc ================================================
// Copyright 2020 LMNT, Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ==============================================================================
#include <cublas_v2.h>
#include <cuda_runtime_api.h>

#include "blas.h"
#include "haste.h"
#include "inline_ops.h"

namespace {

// Fused IndRNN backward kernel. Each thread owns one (batch, hidden-unit) pair
// and walks backward through all time steps, accumulating the per-unit
// parameter gradients (`du_out`, `db_out`) and writing the per-step
// pre-activation gradients to `dk_out` for the subsequent GEMMs.
template<typename T, bool ApplyZoneout>
__global__
void IndrnnBwdOps(
    const int steps,
    const int batch_size,
    const int hidden_size,
    const T* u,
    const T* h_prev,
    const T* h,
    const T* dh_new,
    T* du_out,
    T* db_out,
    T* dh_inout,
    T* dk_out,
    const T* zoneout_mask) {
  const int row = blockDim.x * blockIdx.x + threadIdx.x;
  const int col = blockDim.y * blockIdx.y + threadIdx.y;

  if (row >= hidden_size || col >= batch_size)
    return;

  const int NH = batch_size * hidden_size;
  const int idx = col * hidden_size + row;

  const T u_row = u[row];
  T dh_inout_idx = dh_inout[idx];
  T du_sum = static_cast<T>(0.0);
  T db_sum = static_cast<T>(0.0);

  // Walk time steps in reverse: i indexes the start of step t's [N,H] slab.
  for (int i = (steps - 1) * NH; i >= 0; i -= NH) {
    T dh_total = dh_new[idx + i] + dh_inout_idx;
    T dh = static_cast<T>(0.0);
    if (ApplyZoneout) {
      const T mask = zoneout_mask[idx + i];
      // Split the gradient between the zoned-out (carried) path and the
      // recurrent path, mirroring the forward-pass mixing.
      dh = (static_cast<T>(1.0) - mask) * dh_total;
      dh_total = mask * dh_total;
    }

    const T dk = d_tanh(h[idx + i]) * dh_total;
    dk_out[idx + i] = dk;
    dh_inout_idx = dh + u_row * dk;
    du_sum += h_prev[idx + i] * dk;
    db_sum += dk;
  }

  dh_inout[idx] = dh_inout_idx;
  // Reduce across the batch dimension; threads with the same `row` collide.
  atomicAdd(&du_out[row], du_sum);
  atomicAdd(&db_out[row], db_sum);
}

}  // anonymous namespace

namespace haste {
namespace v0 {
namespace indrnn {

template<typename T>
struct BackwardPass<T>::private_data {
  int batch_size;
  int input_size;
  int hidden_size;
  cublasHandle_t blas_handle;
  cudaStream_t stream;       // internal stream all work is enqueued on
  cudaStream_t sync_stream;  // caller's stream to synchronize with on destruction
};

template<typename T>
BackwardPass<T>::BackwardPass(
    const int batch_size,
    const int input_size,
    const int hidden_size,
    const cublasHandle_t& blas_handle,
    const cudaStream_t& stream) : data_(new private_data) {
  data_->batch_size = batch_size;
  data_->input_size = input_size;
  data_->hidden_size = hidden_size;
  data_->blas_handle = blas_handle;
  data_->sync_stream = stream;
  cudaStreamCreate(&data_->stream);
}

template<typename T>
BackwardPass<T>::~BackwardPass() {
  if (data_->sync_stream) {
    // Make the caller's stream wait on our work instead of blocking the host.
    cudaEvent_t event;
    cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
    cudaEventRecord(event, data_->stream);
    cudaStreamWaitEvent(data_->sync_stream, event, 0);
    cudaEventDestroy(event);
  } else {
    cudaStreamSynchronize(data_->stream);
  }
  cudaStreamDestroy(data_->stream);
  delete data_;
}

template<typename T>
void BackwardPass<T>::Run(
    const int steps,
    const T* W_t,
    const T* u,
    const T* b,
    const T* x_t,
    const T* h,
    const T* dh_new,
    T* dx,
    T* dW,
    T* du,
    T* db,
    T* dh,
    T* workspace,
    const T* zoneout_mask) {
  const T alpha = static_cast<T>(1.0);
  const T beta = static_cast<T>(0.0);

  const typename blas<T>::set_pointer_mode scoped1(data_->blas_handle);

  const int batch_size = data_->batch_size;
  const int input_size = data_->input_size;
  const int hidden_size = data_->hidden_size;
  const cublasHandle_t blas_handle = data_->blas_handle;
  const cudaStream_t stream = data_->stream;

  const dim3 blockDim(64, 16);
  const dim3 gridDim(
      (hidden_size + blockDim.x - 1) / blockDim.x,
      (batch_size + blockDim.y - 1) / blockDim.y);
  const int NH = batch_size * hidden_size;

  // `h` holds T+1 slabs; `h` is the t-1 state and `h + NH` the t state.
  if (zoneout_mask) {
    IndrnnBwdOps<T, true><<<gridDim, blockDim, 0, stream>>>(
        steps,
        batch_size,
        hidden_size,
        u,
        h,
        h + NH,
        dh_new + NH,
        du,
        db,
        dh,
        workspace,
        zoneout_mask);
  } else {
    IndrnnBwdOps<T, false><<<gridDim, blockDim, 0, stream>>>(
        steps,
        batch_size,
        hidden_size,
        u,
        h,
        h + NH,
        dh_new + NH,
        du,
        db,
        dh,
        workspace,
        nullptr);
  }

  cudaStream_t save_stream;
  cublasGetStream(blas_handle, &save_stream);
  cublasSetStream(blas_handle, stream);
  // dW = dk * x_t over all time steps at once.
  blas<T>::gemm(blas_handle,
      CUBLAS_OP_N, CUBLAS_OP_N,
      hidden_size, input_size, batch_size * steps,
      &alpha,
      workspace, hidden_size,
      x_t, batch_size * steps,
      &beta,
      dW, hidden_size);
  // dx = W_t * dk over all time steps at once.
  blas<T>::gemm(blas_handle,
      CUBLAS_OP_N, CUBLAS_OP_N,
      input_size, steps * batch_size, hidden_size,
      &alpha,
      W_t, input_size,
      workspace, hidden_size,
      &beta,
      dx, input_size);
  cublasSetStream(blas_handle, save_stream);
}

template class BackwardPass<float>;
template class BackwardPass<double>;

}  // namespace indrnn
}  // namespace v0
}  // namespace haste

================================================ FILE: lib/indrnn_forward_gpu.cu.cc ================================================
// Copyright 2020 LMNT, Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ==============================================================================
// NOTE(review): the extraction stripped every `<...>` span (template parameter
// lists, template arguments, bracketed #includes, and `<<<...>>>` kernel launch
// configurations). They are restored below; diff against upstream to confirm.

#include <cublas_v2.h>
#include <cuda_runtime_api.h>

#include "blas.h"
#include "haste.h"
#include "inline_ops.h"

namespace {

// Pointwise IndRNN recurrence: h_t = tanh(W x_t + u ⊙ h_{t-1} + b), applied
// for all `steps` time steps by one thread per (batch, hidden) element.
// `Training`/`ApplyZoneout` are compile-time flags to keep the hot loop free
// of runtime branches.
template<typename T, bool Training, bool ApplyZoneout>
__global__
void IndrnnFwdOps(
    const int steps,
    const int batch_size,
    const int hidden_size,
    const T* Wx,
    const T* u,
    const T* b,
    const T* h,
    T* h_out,
    const float zoneout_prob,
    const T* zoneout_mask) {
  const int row = blockDim.x * blockIdx.x + threadIdx.x;
  const int col = blockDim.y * blockIdx.y + threadIdx.y;

  if (row >= hidden_size || col >= batch_size)
    return;

  const int idx = col * hidden_size + row;
  const int NH = batch_size * hidden_size;
  const T u_row = u[row];
  const T b_row = b[row];

  for (int i = 0; i < steps * NH; i += NH) {
    const T a = Wx[idx + i] + u_row * h[idx + i] + b_row;
    T cur_h_value = tanh(a);
    if (ApplyZoneout) {
      if (Training) {
        // Stochastic zoneout: keep the previous state where the mask says so.
        cur_h_value = (cur_h_value - h[idx + i]) * zoneout_mask[idx + i] + h[idx + i];
      } else {
        // Inference: use the expected value of the zoneout mask.
        cur_h_value = (zoneout_prob * h[idx + i]) + ((1.0f - zoneout_prob) * cur_h_value);
      }
    }
    h_out[idx + i] = cur_h_value;
  }
}

}  // anonymous namespace

namespace haste {
namespace v0 {
namespace indrnn {

// Opaque implementation state for the IndRNN forward pass (pimpl idiom).
template<typename T>
struct ForwardPass<T>::private_data {
  bool training;
  int batch_size;
  int input_size;
  int hidden_size;
  cublasHandle_t blas_handle;
  cudaStream_t stream;       // internal stream all work is enqueued on
  cudaStream_t sync_stream;  // caller's stream to synchronize with on teardown
};

template<typename T>
ForwardPass<T>::ForwardPass(
    const bool training,
    const int batch_size,
    const int input_size,
    const int hidden_size,
    const cublasHandle_t& blas_handle,
    const cudaStream_t& stream) : data_(new private_data) {
  data_->training = training;
  data_->batch_size = batch_size;
  data_->input_size = input_size;
  data_->hidden_size = hidden_size;
  data_->blas_handle = blas_handle;
  data_->sync_stream = stream;
  cudaStreamCreate(&data_->stream);
}

template<typename T>
ForwardPass<T>::~ForwardPass() {
  if (data_->sync_stream) {
    // Make the caller's stream wait for our pending work instead of blocking
    // the host.
    cudaEvent_t event;
    cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
    cudaEventRecord(event, data_->stream);
    cudaStreamWaitEvent(data_->sync_stream, event, 0);
    cudaEventDestroy(event);
  } else {
    cudaStreamSynchronize(data_->stream);
  }
  cudaStreamDestroy(data_->stream);
  delete data_;
}

// Runs the full forward pass: one big GEMM computes W·x for all time steps at
// once into `workspace`, then a single kernel launch performs the sequential
// recurrence for every (batch, hidden) element.
template<typename T>
void ForwardPass<T>::Run(
    const int steps,
    const T* W,
    const T* u,
    const T* b,
    const T* x,
    T* h,
    T* workspace,
    const float zoneout_prob,
    const T* zoneout_mask) {
  static const T alpha = static_cast<T>(1.0);
  static const T beta = static_cast<T>(0.0);

  const typename blas<T>::set_pointer_mode scoped1(data_->blas_handle);

  const bool training = data_->training;
  const int batch_size = data_->batch_size;
  const int input_size = data_->input_size;
  const int hidden_size = data_->hidden_size;
  const cublasHandle_t blas_handle = data_->blas_handle;
  const cudaStream_t stream = data_->stream;

  // Save/restore the caller's cuBLAS stream around our GEMM.
  cudaStream_t save_stream;
  cublasGetStream(blas_handle, &save_stream);

  cublasSetStream(blas_handle, stream);
  blas<T>::gemm(blas_handle,
      CUBLAS_OP_N, CUBLAS_OP_N,
      hidden_size, steps * batch_size, input_size,
      &alpha,
      W, hidden_size,
      x, input_size,
      &beta,
      workspace, hidden_size);

  const dim3 blockDim(64, 16);
  const dim3 gridDim(
      (hidden_size + blockDim.x - 1) / blockDim.x,
      (batch_size + blockDim.y - 1) / blockDim.y);

  const int NH = batch_size * hidden_size;
  // Dispatch to the statically-specialized kernel variant.
  if (training) {
    if (zoneout_prob && zoneout_mask) {
      IndrnnFwdOps<T, true, true><<<gridDim, blockDim, 0, stream>>>(
          steps, batch_size, hidden_size,
          workspace, u, b,
          h, h + NH,
          zoneout_prob, zoneout_mask);
    } else {
      IndrnnFwdOps<T, true, false><<<gridDim, blockDim, 0, stream>>>(
          steps, batch_size, hidden_size,
          workspace, u, b,
          h, h + NH,
          0.0f, nullptr);
    }
  } else {
    if (zoneout_prob && zoneout_mask) {
      IndrnnFwdOps<T, false, true><<<gridDim, blockDim, 0, stream>>>(
          steps, batch_size, hidden_size,
          workspace, u, b,
          h, h + NH,
          zoneout_prob, zoneout_mask);
    } else {
      IndrnnFwdOps<T, false, false><<<gridDim, blockDim, 0, stream>>>(
          steps, batch_size, hidden_size,
          workspace, u, b,
          h, h + NH,
          0.0f, nullptr);
    }
  }

  cublasSetStream(blas_handle, save_stream);
}

template class ForwardPass<float>;
template class ForwardPass<double>;

}  // namespace indrnn
}  // namespace v0
}  // namespace haste

================================================
FILE: lib/inline_ops.h
================================================
// Copyright 2020 LMNT, Inc.
All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // ============================================================================== #pragma once #include template __device__ __forceinline__ T sigmoid(const T x) { return static_cast(1.0) / (static_cast(1.0) + exp(-x)); } template __device__ __forceinline__ T tanh(const T x) { return std::tanh(x); } template __device__ __forceinline__ T d_sigmoid(const T sigmoid_output) { return sigmoid_output * (static_cast(1.0) - sigmoid_output); } template __device__ __forceinline__ T d_tanh(const T tanh_output) { return (static_cast(1.0) - tanh_output * tanh_output); } #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600) __device__ __forceinline__ double atomicAdd(double* address, double val) { unsigned long long int* address_as_ull = (unsigned long long int*)address; unsigned long long int old = *address_as_ull, assumed; do { assumed = old; old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); } while (assumed != old); return __longlong_as_double(old); } #endif #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600) template<> __device__ __forceinline__ half sigmoid(const half x) { return static_cast(1.0) / (static_cast(1.0) + hexp(-x)); } template<> __device__ __forceinline__ half tanh(const half x) { return std::tanh(float(x)); } #endif ================================================ FILE: lib/layer_norm_backward_gpu.cu.cc 
================================================
// Copyright 2020 LMNT, Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ==============================================================================
// NOTE(review): the extraction stripped every `<...>` span (template parameter
// lists, template arguments, bracketed #includes, and `<<<...>>>` kernel launch
// configurations). They are restored below; diff against upstream to confirm.

#include <cuda_runtime_api.h>

#include "haste.h"
#include "inline_ops.h"

namespace {

// Layer-norm backward kernel. One thread column (`threadIdx.y` lane) strides
// over the hidden dimension of one batch row; three partial sums per lane are
// reduced in shared memory. `mean`/`invstd` come from the forward-pass cache.
// `ApplyBeta` selects at compile time whether a beta (offset) gradient exists.
template<typename T, bool ApplyBeta>
__global__
void LayerNormGrad(
    const int batch_size,
    const int hidden_size,
    const T* gamma,
    const T* x,
    const T* dy,
    T* dgamma,
    T* dbeta,
    T* dx,
    T* cache) {
  const int batch = blockDim.x * blockIdx.x + threadIdx.x;
  if (batch >= batch_size)
    return;

  extern __shared__ int shared_var[];
  T* shared = reinterpret_cast<T*>(shared_var);

  const int index = threadIdx.y;
  const int stride = blockDim.y;
  const int batch_idx = batch * hidden_size;
  // Each batch row in this block owns a contiguous slab of 3 values per lane.
  const int batch_block_idx = threadIdx.x * stride * 3;

  const T mean = cache[batch * 2 + 0];
  const T invstd = cache[batch * 2 + 1];

  T dsigma_tmp = static_cast<T>(0.0);
  T dmu1_tmp = static_cast<T>(0.0);
  T dmu2_tmp = static_cast<T>(0.0);
  for (int i = index; i < hidden_size; i += stride) {
    const T cur_dy = dy[batch_idx + i];
    const T centered_x = x[batch_idx + i] - mean;
    const T z = centered_x * invstd;
    // gamma/beta gradients are shared across the batch: accumulate atomically.
    atomicAdd(&dgamma[i], z * cur_dy);
    if (ApplyBeta)
      atomicAdd(&dbeta[i], cur_dy);
    const T db = gamma[i] * cur_dy;
    dsigma_tmp += centered_x * db;
    dmu1_tmp += centered_x;
    dmu2_tmp += db;
  }
  shared[batch_block_idx + index * 3 + 0] = dsigma_tmp;
  shared[batch_block_idx + index * 3 + 1] = dmu1_tmp;
  shared[batch_block_idx + index * 3 + 2] = dmu2_tmp;
  __syncthreads();

  // Tree reduction of the three partial sums across lanes.
  for (int s = stride / 2; s > 0; s >>= 1) {
    if (index < s) {
      shared[batch_block_idx + index * 3 + 0] += shared[batch_block_idx + (index + s) * 3 + 0];
      shared[batch_block_idx + index * 3 + 1] += shared[batch_block_idx + (index + s) * 3 + 1];
      shared[batch_block_idx + index * 3 + 2] += shared[batch_block_idx + (index + s) * 3 + 2];
    }
    __syncthreads();
  }

  const T dsigma = static_cast<T>(-0.5) * shared[batch_block_idx + 0] * invstd * invstd * invstd;
  const T dmu = (static_cast<T>(-2.0) * shared[batch_block_idx + 1] * dsigma / hidden_size) -
                (shared[batch_block_idx + 2] * invstd);

  for (int i = index; i < hidden_size; i += stride) {
    const T cur_dy = dy[batch_idx + i];
    const T centered_x = x[batch_idx + i] - mean;
    const T db = gamma[i] * cur_dy;
    dx[batch_idx + i] = (static_cast<T>(2.0) * centered_x * dsigma / hidden_size) +
                        (invstd * db) +
                        (dmu / hidden_size);
  }
}

}  // anonymous namespace

namespace haste {
namespace v0 {
namespace layer_norm {

template<typename T>
BackwardPass<T>::BackwardPass(
    const int batch_size,
    const int hidden_size,
    const T* gamma,
    const T* beta,
    const T* x,
    T* dgamma,
    T* dbeta,
    T* cache)
        : batch_size_(batch_size),
          hidden_size_(hidden_size),
          gamma_(gamma),
          beta_(beta),
          x_(x),
          dgamma_(dgamma),
          dbeta_(dbeta),
          cache_(cache),
          // The backward pass consumes the cached rows in reverse order, so the
          // cursor starts at the end.
          partial_(batch_size) {
}

template<typename T>
void BackwardPass<T>::Run(const cudaStream_t& stream, const T* dy, T* dx) {
  RunPartial(stream, batch_size_, dy, dx);
}

// Processes the next `minibatch` rows (walking backwards through the cached
// forward activations) and advances the internal cursor.
template<typename T>
void BackwardPass<T>::RunPartial(
    const cudaStream_t& stream,
    const int minibatch,
    const T* dy,
    T* dx) {
  assert(partial_ - minibatch >= 0);

  dim3 blockDim(4, 256);
  dim3 gridDim;
  gridDim.x = (minibatch + blockDim.x - 1) / blockDim.x;
  // 3 running sums per lane, per batch row handled by the block.
  const int shared_mem_size = sizeof(T) * blockDim.x * blockDim.y * 3;

  if (beta_ && dbeta_) {
    LayerNormGrad<T, true><<<gridDim, blockDim, shared_mem_size, stream>>>(
        minibatch,
        hidden_size_,
        gamma_,
        x_ + (partial_ - minibatch) * hidden_size_,
        dy,
        dgamma_,
        dbeta_,
        dx,
        cache_ + (partial_ - minibatch) * 2);
  } else {
    LayerNormGrad<T, false><<<gridDim, blockDim, shared_mem_size, stream>>>(
        minibatch,
        hidden_size_,
        gamma_,
        x_ + (partial_ - minibatch) * hidden_size_,
        dy,
        dgamma_,
        nullptr,
        dx,
        cache_ + (partial_ - minibatch) * 2);
  }

  partial_ -= minibatch;
}

template class BackwardPass<float>;
template class BackwardPass<double>;

}  // namespace layer_norm
}  // namespace v0
}  // namespace haste

================================================
FILE: lib/layer_norm_forward_gpu.cu.cc
================================================
// Copyright 2020 LMNT, Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ==============================================================================

#include <cuda_runtime_api.h>

#include "haste.h"

namespace {

// Layer-norm forward kernel: two shared-memory reductions per batch row (sum,
// then sum of squared differences), then the normalized output. `mean` and
// `invstd` are written to `cache` for the backward pass.
template<typename T, bool ApplyBeta>
__global__
void LayerNorm(
    const int batch_size,
    const int hidden_size,
    const T* gamma,
    const T* beta,
    const T* x,
    T* y,
    T* cache) {
  const int batch = blockDim.x * blockIdx.x + threadIdx.x;
  if (batch >= batch_size)
    return;

  extern __shared__ int shared_var[];
  T* shared = reinterpret_cast<T*>(shared_var);

  const int index = threadIdx.y;
  const int stride = blockDim.y;
  const int batch_idx = batch * hidden_size;
  const int batch_block_idx = threadIdx.x * stride;

  // TODO: use parallel single-pass algorithm to compute moments.

  // Reduce sum
  T sum = static_cast<T>(0.0);
  for (int i = index; i < hidden_size; i += stride)
    sum += x[batch_idx + i];
  shared[batch_block_idx + index] = sum;
  __syncthreads();
  for (int s = stride / 2; s > 0; s >>= 1) {
    if (index < s)
      shared[batch_block_idx + index] += shared[batch_block_idx + index + s];
    __syncthreads();
  }

  // Make sure this read completes before we start writing to `shared` again below.
  const T mean = shared[batch_block_idx] / hidden_size;
  __syncthreads();

  // Reduce squared difference
  T sumsq = static_cast<T>(0.0);
  for (int i = index; i < hidden_size; i += stride) {
    const T diff = x[batch_idx + i] - mean;
    sumsq += diff * diff;
  }
  shared[batch_block_idx + index] = sumsq;
  __syncthreads();
  for (int s = stride / 2; s > 0; s >>= 1) {
    if (index < s)
      shared[batch_block_idx + index] += shared[batch_block_idx + index + s];
    __syncthreads();
  }

  // Epsilon keeps the reciprocal finite for constant rows.
  const T invstd = rsqrt(shared[batch_block_idx] / hidden_size + static_cast<T>(1e-5));

  for (int i = index; i < hidden_size; i += stride) {
    if (ApplyBeta)
      y[batch_idx + i] = (x[batch_idx + i] - mean) * invstd * gamma[i] + beta[i];
    else
      y[batch_idx + i] = (x[batch_idx + i] - mean) * invstd * gamma[i];
  }

  cache[batch * 2 + 0] = mean;
  cache[batch * 2 + 1] = invstd;
}

}  // anonymous namespace

namespace haste {
namespace v0 {
namespace layer_norm {

template<typename T>
ForwardPass<T>::ForwardPass(
    const int batch_size,
    const int hidden_size,
    const T* gamma,
    const T* beta,
    T* cache)
        : batch_size_(batch_size),
          hidden_size_(hidden_size),
          gamma_(gamma),
          beta_(beta),
          cache_(cache),
          partial_(0) {
}

template<typename T>
void ForwardPass<T>::Run(const cudaStream_t& stream, const T* x, T* y) {
  RunPartial(stream, batch_size_, x, y);
}

// Normalizes the next `minibatch` rows and advances the internal cursor so the
// cached moments line up with the rows consumed.
template<typename T>
void ForwardPass<T>::RunPartial(
    const cudaStream_t& stream,
    const int minibatch,
    const T* x,
    T* y) {
  assert(partial_ + minibatch <= batch_size_);

  dim3 blockDim(4, 256);
  dim3 gridDim;
  gridDim.x = (minibatch + blockDim.x - 1) / blockDim.x;
  const int shared_mem_size = sizeof(T) * blockDim.x * blockDim.y;

  if (beta_) {
    LayerNorm<T, true><<<gridDim, blockDim, shared_mem_size, stream>>>(
        minibatch,
        hidden_size_,
        gamma_,
        beta_,
        x,
        y,
        cache_ + partial_ * 2);
  } else {
    LayerNorm<T, false><<<gridDim, blockDim, shared_mem_size, stream>>>(
        minibatch,
        hidden_size_,
        gamma_,
        nullptr,
        x,
        y,
        cache_ + partial_ * 2);
  }

  partial_ += minibatch;
}

template class ForwardPass<float>;
template class ForwardPass<double>;

}  // namespace layer_norm
}  // namespace v0
}  // namespace haste

================================================
FILE: lib/layer_norm_gru_backward_gpu.cu.cc
================================================
// Copyright 2020 LMNT, Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ==============================================================================
// NOTE(review): the extraction stripped every `<...>` span (template parameter
// lists, template arguments, bracketed #includes, and `<<<...>>>` kernel launch
// configurations). They are restored below; diff against upstream to confirm.

#include <cublas_v2.h>
#include <cuda_runtime_api.h>

#include "blas.h"
#include "haste.h"
#include "inline_ops.h"

namespace {

// Per-timestep pointwise backward ops for the layer-normalized GRU. Consumes
// the cached activations `v` (z, r, g, and the pre-gated recurrent candidate)
// and produces gate pre-activation gradients `dp` (input path) and `dq`
// (recurrent path), plus bias gradients via atomics.
template<typename T, bool ApplyZoneout>
__global__
void PointwiseOperations(const int batch_dim,
                         const int hidden_dim,
                         const T* h,
                         const T* v,
                         const T* dh_new,
                         T* dbx_out,
                         T* dbr_out,
                         T* dh_inout,
                         T* dp_out,
                         T* dq_out,
                         const T* zoneout_mask) {  // Zoneout mask (only used if ApplyZoneout==true)
  const int row = blockDim.x * blockIdx.x + threadIdx.x;
  const int col = blockDim.y * blockIdx.y + threadIdx.y;

  if (row >= hidden_dim || col >= batch_dim)
    return;

  const int base_idx = col * hidden_dim + row;

  // Gradient from this step's output plus the carried recurrent gradient.
  T dh_total = dh_new[base_idx] + dh_inout[base_idx];

  const int stride4_base_idx = col * (hidden_dim * 4) + row;
  const int z_idx = stride4_base_idx + 0 * hidden_dim;
  const int r_idx = stride4_base_idx + 1 * hidden_dim;
  const int g_idx = stride4_base_idx + 2 * hidden_dim;
  const int q_g_idx = stride4_base_idx + 3 * hidden_dim;

  const T z = v[z_idx];
  const T r = v[r_idx];
  const T g = v[g_idx];
  const T q_g = v[q_g_idx];

  if (ApplyZoneout) {
    const T mask = zoneout_mask[base_idx];
    dh_inout[base_idx] = (static_cast<T>(1.0) - mask) * dh_total;
    dh_total = mask * dh_total;
    dh_inout[base_idx] += z * dh_total;
  } else {
    dh_inout[base_idx] = z * dh_total;
  }

  const T dg = (static_cast<T>(1.0) - z) * dh_total;
  const T dz = (h[base_idx] - g) * dh_total;
  const T dp_g = d_tanh(g) * dg;
  const T dq_g = dp_g * r;
  const T dr = dp_g * q_g;
  const T dp_r = d_sigmoid(r) * dr;
  const T dq_r = dp_r;
  const T dp_z = d_sigmoid(z) * dz;
  const T dq_z = dp_z;

  const int idx = col * (hidden_dim * 3) + row;

  dp_out[idx + 0 * hidden_dim] = dp_z;
  dp_out[idx + 1 * hidden_dim] = dp_r;
  dp_out[idx + 2 * hidden_dim] = dp_g;

  dq_out[idx + 0 * hidden_dim] = dq_z;
  dq_out[idx + 1 * hidden_dim] = dq_r;
  dq_out[idx + 2 * hidden_dim] = dq_g;

  // Bias gradients are summed over the batch: accumulate atomically.
  atomicAdd(&dbx_out[row + 0 * hidden_dim], dp_z);
  atomicAdd(&dbx_out[row + 1 * hidden_dim], dp_r);
  atomicAdd(&dbx_out[row + 2 * hidden_dim], dp_g);

  atomicAdd(&dbr_out[row + 0 * hidden_dim], dq_z);
  atomicAdd(&dbr_out[row + 1 * hidden_dim], dq_r);
  atomicAdd(&dbr_out[row + 2 * hidden_dim], dq_g);
}

}  // anonymous namespace

namespace haste {
namespace v0 {
namespace layer_norm_gru {

// Opaque implementation state (pimpl). Two internal streams let GEMMs overlap.
template<typename T>
struct BackwardPass<T>::private_data {
  int batch_size;
  int input_size;
  int hidden_size;
  cublasHandle_t blas_handle;
  cudaStream_t stream[2];
  cudaEvent_t event;
  cudaStream_t sync_stream;  // caller's stream to synchronize with on teardown
};

template<typename T>
BackwardPass<T>::BackwardPass(
    const int batch_size,
    const int input_size,
    const int hidden_size,
    const cublasHandle_t& blas_handle,
    const cudaStream_t& stream) : data_(new private_data) {
  data_->batch_size = batch_size;
  data_->input_size = input_size;
  data_->hidden_size = hidden_size;
  data_->blas_handle = blas_handle;
  data_->sync_stream = stream;
  cudaStreamCreate(&data_->stream[0]);
  cudaStreamCreate(&data_->stream[1]);
  cudaEventCreateWithFlags(&data_->event, cudaEventDisableTiming);
}

template<typename T>
BackwardPass<T>::~BackwardPass() {
  if (data_->sync_stream) {
    // Chain the caller's stream behind both internal streams.
    cudaEventRecord(data_->event, data_->stream[1]);
    cudaStreamWaitEvent(data_->sync_stream, data_->event, 0);
    cudaEventRecord(data_->event, data_->stream[0]);
    cudaStreamWaitEvent(data_->sync_stream, data_->event, 0);
  } else {
    cudaStreamSynchronize(data_->stream[1]);
    cudaStreamSynchronize(data_->stream[0]);
  }
  cudaEventDestroy(data_->event);
  cudaStreamDestroy(data_->stream[1]);
  cudaStreamDestroy(data_->stream[0]);
  delete data_;
}

// One reverse time step: pointwise gradients, layer-norm backward on the
// recurrent path, then the recurrent GEMM that carries `dh` to step t-1.
template<typename T>
void BackwardPass<T>::IterateInternal(
    const T* R_t,     // [H*3,H]
    const T* h,       // [N,H]
    const T* v,       // [N,H*4]
    const T* dh_new,  // [N,H]
    T* dbx,           // [H*3]
    T* dbr,           // [H*3]
    T* dh,            // [N,H]
    T* dp,            // [N,H*3]
    T* dq,            // [N,H*3]
    layer_norm::BackwardPass<T>& layer_norm2,
    const T* zoneout_mask) {  // [N,H]
  const T alpha = static_cast<T>(1.0);
  const T beta_sum = static_cast<T>(1.0);  // accumulate into dh

  const int batch_size = data_->batch_size;
  const int hidden_size = data_->hidden_size;
  const cublasHandle_t blas_handle = data_->blas_handle;
  const cudaStream_t stream1 = data_->stream[0];
  const cudaEvent_t event = data_->event;

  // Compute launch configuration for pointwise operations kernel.
  const dim3 blockDim(32, 16);
  const dim3 gridDim(
      (hidden_size + blockDim.x - 1) / blockDim.x,
      (batch_size + blockDim.y - 1) / blockDim.y);

  if (zoneout_mask) {
    PointwiseOperations<T, true><<<gridDim, blockDim, 0, stream1>>>(
        batch_size,
        hidden_size,
        h,
        v,
        dh_new,
        dbx,
        dbr,
        dh,
        dp,
        dq,
        zoneout_mask);
  } else {
    PointwiseOperations<T, false><<<gridDim, blockDim, 0, stream1>>>(
        batch_size,
        hidden_size,
        h,
        v,
        dh_new,
        dbx,
        dbr,
        dh,
        dp,
        dq,
        nullptr);
  }
  cudaEventRecord(event, stream1);

  cublasSetStream(blas_handle, stream1);
  layer_norm2.RunPartial(stream1, batch_size, dq, dq);
  blas<T>::gemm(blas_handle,
      CUBLAS_OP_N, CUBLAS_OP_N,
      hidden_size, batch_size, hidden_size * 3,
      &alpha,
      R_t, hidden_size,
      dq, hidden_size * 3,
      &beta_sum,
      dh, hidden_size);
}

// Full backward pass: iterate time in reverse, then batch the large weight and
// input-gradient GEMMs over all time steps at once.
template<typename T>
void BackwardPass<T>::Run(
    const int steps,
    const T* W_t,
    const T* R_t,
    const T* bx,
    const T* br,
    const T* x_t,
    const T* h,
    const T* v,
    const T* dh_new,
    T* dx,
    T* dW,
    T* dR,
    T* dbx,
    T* dbr,
    T* dh,
    T* dp,
    T* dq,
    layer_norm::BackwardPass<T>& layer_norm1,
    layer_norm::BackwardPass<T>& layer_norm2,
    const T* zoneout_mask) {
  const T alpha = static_cast<T>(1.0);
  const T beta_sum = static_cast<T>(1.0);
  const T beta_assign = static_cast<T>(0.0);

  const typename blas<T>::set_pointer_mode scoped1(data_->blas_handle);

  const int batch_size = data_->batch_size;
  const int input_size = data_->input_size;
  const int hidden_size = data_->hidden_size;
  const cublasHandle_t blas_handle = data_->blas_handle;
  const cudaStream_t stream1 = data_->stream[0];
  const cudaStream_t stream2 = data_->stream[1];
  const cudaEvent_t event = data_->event;

  cudaStream_t save_stream;
  cublasGetStream(blas_handle, &save_stream);

  const int NH = batch_size * hidden_size;
  for (int i = steps - 1; i >= 0; --i) {
    IterateInternal(
        R_t,
        h + i * NH,
        v + i * NH * 4,
        dh_new + (i + 1) * NH,
        dbx,
        dbr,
        dh,
        dp + i * NH * 3,
        dq + i * NH * 3,
        layer_norm2,
        zoneout_mask ? zoneout_mask + i * NH : nullptr);
  }

  // Wait for pointwise operations to complete since there's a
  // data dependency between its output (`dp`, `dq`) and the following matmuls.
  cudaStreamWaitEvent(stream2, event, 0);

  cublasSetStream(blas_handle, stream2);
  layer_norm1.Run(stream2, dp, dp);
  blas<T>::gemm(blas_handle,
      CUBLAS_OP_N, CUBLAS_OP_N,
      input_size, batch_size * steps, hidden_size * 3,
      &alpha,
      W_t, input_size,
      dp, hidden_size * 3,
      &beta_assign,
      dx, input_size);

  cublasSetStream(blas_handle, stream1);
  blas<T>::gemm(blas_handle,
      CUBLAS_OP_N, CUBLAS_OP_T,
      hidden_size * 3, hidden_size, batch_size * steps,
      &alpha,
      dq, hidden_size * 3,
      h, hidden_size,
      &beta_sum,
      dR, hidden_size * 3);

  cublasSetStream(blas_handle, stream2);
  blas<T>::gemm(blas_handle,
      CUBLAS_OP_N, CUBLAS_OP_N,
      hidden_size * 3, input_size, batch_size * steps,
      &alpha,
      dp, hidden_size * 3,
      x_t, batch_size * steps,
      &beta_sum,
      dW, hidden_size * 3);

  cublasSetStream(blas_handle, save_stream);
}

template struct BackwardPass<float>;
template struct BackwardPass<double>;

}  // namespace layer_norm_gru
}  // namespace v0
}  // namespace haste

================================================
FILE: lib/layer_norm_gru_forward_gpu.cu.cc
================================================
// Copyright 2020 LMNT, Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ==============================================================================
// NOTE(review): the extraction stripped every `<...>` span (template parameter
// lists, template arguments, bracketed #includes, and `<<<...>>>` kernel launch
// configurations). They are restored below; diff against upstream to confirm.

#include <cublas_v2.h>
#include <cuda_runtime_api.h>

#include "blas.h"
#include "haste.h"
#include "inline_ops.h"

namespace {

// Per-timestep pointwise forward ops for the layer-normalized GRU. `Wx` and
// `Rh` are the (already layer-normalized) input and recurrent projections.
// When `Training`, the gate activations are cached in `v` for backprop.
template<typename T, bool Training, bool ApplyZoneout>
__global__
void PointwiseOperations(const int batch_dim,
                         const int hidden_dim,
                         const T* Wx,
                         const T* Rh,
                         const T* bx,
                         const T* br,
                         const T* h,
                         T* h_out,
                         T* v,
                         const float zoneout_prob,
                         const T* zoneout_mask) {  // Zoneout mask (only used if ApplyZoneout==true)
  const int row = blockDim.x * blockIdx.x + threadIdx.x;
  const int col = blockDim.y * blockIdx.y + threadIdx.y;

  if (row >= hidden_dim || col >= batch_dim)
    return;

  const int weight_idx = col * (hidden_dim * 3) + row;

  // Index into the `h` and `h_out` vectors (they have a stride of `hidden_dim`).
  const int output_idx = col * hidden_dim + row;

  // Indicies into the Wx and Rh matrices (for each of the u, r, and e components).
  const int z_idx = weight_idx + 0 * hidden_dim;
  const int r_idx = weight_idx + 1 * hidden_dim;
  const int g_idx = weight_idx + 2 * hidden_dim;

  // Indices into the bias vectors (for each of the u, r, and e components).
  const int bz_idx = row + 0 * hidden_dim;
  const int br_idx = row + 1 * hidden_dim;
  const int bg_idx = row + 2 * hidden_dim;

  const T z = sigmoid(Wx[z_idx] + Rh[z_idx] + bx[bz_idx] + br[bz_idx]);
  const T r = sigmoid(Wx[r_idx] + Rh[r_idx] + bx[br_idx] + br[br_idx]);
  const T g = tanh   (Wx[g_idx] + r * (Rh[g_idx] + br[bg_idx]) + bx[bg_idx]);

  // Store internal activations if we're eventually going to backprop.
  if (Training) {
    const int base_v_idx = col * (hidden_dim * 4) + row;
    v[base_v_idx + 0 * hidden_dim] = z;
    v[base_v_idx + 1 * hidden_dim] = r;
    v[base_v_idx + 2 * hidden_dim] = g;
    v[base_v_idx + 3 * hidden_dim] = Rh[g_idx] + br[bg_idx];
  }

  T cur_h_value = z * h[output_idx] + (static_cast<T>(1.0) - z) * g;

  if (ApplyZoneout) {
    if (Training) {
      cur_h_value = (cur_h_value - h[output_idx]) * zoneout_mask[output_idx] + h[output_idx];
    } else {
      cur_h_value = (zoneout_prob * h[output_idx]) + ((1.0f - zoneout_prob) * cur_h_value);
    }
  }

  h_out[output_idx] = cur_h_value;
}

}  // anonymous namespace

namespace haste {
namespace v0 {
namespace layer_norm_gru {

// Opaque implementation state (pimpl). Two internal streams let the big Wx
// GEMM overlap with the per-step recurrence.
template<typename T>
struct ForwardPass<T>::private_data {
  bool training;
  int batch_size;
  int input_size;
  int hidden_size;
  cublasHandle_t blas_handle;
  cudaStream_t stream[2];
  cudaEvent_t event;
  cudaStream_t sync_stream;  // caller's stream to synchronize with on teardown
};

template<typename T>
ForwardPass<T>::ForwardPass(
    const bool training,
    const int batch_size,
    const int input_size,
    const int hidden_size,
    const cublasHandle_t& blas_handle,
    const cudaStream_t& stream) : data_(new private_data) {
  data_->training = training;
  data_->batch_size = batch_size;
  data_->input_size = input_size;
  data_->hidden_size = hidden_size;
  data_->blas_handle = blas_handle;
  data_->sync_stream = stream;
  cudaStreamCreate(&data_->stream[0]);
  cudaStreamCreate(&data_->stream[1]);
  cudaEventCreateWithFlags(&data_->event, cudaEventDisableTiming);
}

template<typename T>
ForwardPass<T>::~ForwardPass() {
  if (data_->sync_stream) {
    // Chain the caller's stream behind both internal streams.
    cudaEventRecord(data_->event, data_->stream[1]);
    cudaStreamWaitEvent(data_->sync_stream, data_->event, 0);
    cudaEventRecord(data_->event, data_->stream[0]);
    cudaStreamWaitEvent(data_->sync_stream, data_->event, 0);
  } else {
    cudaStreamSynchronize(data_->stream[1]);
    cudaStreamSynchronize(data_->stream[0]);
  }
  cudaEventDestroy(data_->event);
  cudaStreamDestroy(data_->stream[1]);
  cudaStreamDestroy(data_->stream[0]);
  delete data_;
}

// One forward time step: recurrent GEMM R·h, layer norm on the result, then
// the pointwise gate computation.
template<typename T>
void ForwardPass<T>::IterateInternal(
    const T* R,      // [H,H*3]
    const T* bx,     // [H*3]
    const T* br,     // [H*3]
    const T* h,      // [N,H]
    T* h_out,        // [N,H]
    T* v,            // [N,H*4]
    T* tmp_Wx_norm,  // [N,H*3]
    T* act_Rh,       // [N,H*3]
    layer_norm::ForwardPass<T>& layer_norm2,
    T* tmp_Rh_norm,
    const float zoneout_prob,
    const T* zoneout_mask) {  // Zoneout mask [N,H]
  // Constants for GEMM
  static const T alpha = static_cast<T>(1.0);
  static const T beta = static_cast<T>(0.0);

  const bool training = data_->training;
  const int batch_size = data_->batch_size;
  const int hidden_size = data_->hidden_size;
  const cublasHandle_t blas_handle = data_->blas_handle;
  const cudaStream_t stream1 = data_->stream[0];
  const cudaEvent_t event = data_->event;

  cublasSetStream(blas_handle, stream1);
  blas<T>::gemm(blas_handle,
      CUBLAS_OP_N, CUBLAS_OP_N,
      hidden_size * 3, batch_size, hidden_size,
      &alpha,
      R, hidden_size * 3,
      h, hidden_size,
      &beta,
      act_Rh, hidden_size * 3);
  layer_norm2.RunPartial(stream1, batch_size, act_Rh, tmp_Rh_norm);

  // Compute launch configuration for pointwise operations kernel.
  const dim3 blockDim(32, 16);
  const dim3 gridDim(
      (hidden_size + blockDim.x - 1) / blockDim.x,
      (batch_size + blockDim.y - 1) / blockDim.y);

  // Ensure the Wx projection (computed on the other stream in `Run`) is ready.
  cudaStreamWaitEvent(stream1, event, 0);

  if (training) {
    if (zoneout_prob && zoneout_mask) {
      PointwiseOperations<T, true, true><<<gridDim, blockDim, 0, stream1>>>(
          batch_size,
          hidden_size,
          tmp_Wx_norm,
          tmp_Rh_norm,
          bx,
          br,
          h,
          h_out,
          v,
          zoneout_prob,
          zoneout_mask);
    } else {
      PointwiseOperations<T, true, false><<<gridDim, blockDim, 0, stream1>>>(
          batch_size,
          hidden_size,
          tmp_Wx_norm,
          tmp_Rh_norm,
          bx,
          br,
          h,
          h_out,
          v,
          0.0f,
          nullptr);
    }
  } else {
    if (zoneout_prob && zoneout_mask) {
      PointwiseOperations<T, false, true><<<gridDim, blockDim, 0, stream1>>>(
          batch_size,
          hidden_size,
          tmp_Wx_norm,
          tmp_Rh_norm,
          bx,
          br,
          h,
          h_out,
          nullptr,
          zoneout_prob,
          zoneout_mask);
    } else {
      PointwiseOperations<T, false, false><<<gridDim, blockDim, 0, stream1>>>(
          batch_size,
          hidden_size,
          tmp_Wx_norm,
          tmp_Rh_norm,
          bx,
          br,
          h,
          h_out,
          nullptr,
          0.0f,
          nullptr);
    }
  }
}

// Full forward pass: one batched GEMM + layer norm for W·x over all steps,
// then the sequential per-step recurrence.
template<typename T>
void ForwardPass<T>::Run(
    const int steps,
    const T* W,       // [C,H*3]
    const T* R,       // [H,H*3]
    const T* bx,      // [H*3]
    const T* br,      // [H*3]
    const T* x,       // [N,C]
    T* h,             // [N,H]
    T* v,             // [N,H*4]
    T* act_Wx,        // [N,H*3]
    layer_norm::ForwardPass<T>& layer_norm1,
    T* tmp_Wx_norm,
    T* act_Rh,        // [N,H*3]
    layer_norm::ForwardPass<T>& layer_norm2,
    T* tmp_Rh_norm,
    const float zoneout_prob,
    const T* zoneout_mask) {  // Zoneout mask [N,H]
  static const T alpha = static_cast<T>(1.0);
  static const T beta = static_cast<T>(0.0);

  const typename blas<T>::set_pointer_mode scoped1(data_->blas_handle);

  const int batch_size = data_->batch_size;
  const int input_size = data_->input_size;
  const int hidden_size = data_->hidden_size;
  const cublasHandle_t blas_handle = data_->blas_handle;
  const cudaStream_t stream2 = data_->stream[1];
  const cudaEvent_t event = data_->event;

  cudaStream_t save_stream;
  cublasGetStream(blas_handle, &save_stream);

  cublasSetStream(blas_handle, stream2);
  blas<T>::gemm(blas_handle,
      CUBLAS_OP_N, CUBLAS_OP_N,
      hidden_size * 3, steps * batch_size, input_size,
      &alpha,
      W, hidden_size * 3,
      x, input_size,
      &beta,
      act_Wx, hidden_size * 3);
  layer_norm1.Run(stream2, act_Wx, tmp_Wx_norm);
  cudaEventRecord(event, stream2);

  const int NH = batch_size * hidden_size;
  for (int i = 0; i < steps; ++i) {
    IterateInternal(
        R,
        bx,
        br,
        h + i * NH,
        h + (i + 1) * NH,
        v + i * NH * 4,
        tmp_Wx_norm + i * NH * 3,
        act_Rh + i * NH * 3,
        layer_norm2,
        tmp_Rh_norm,
        zoneout_prob,
        zoneout_mask ? zoneout_mask + i * NH : nullptr);
  }

  cublasSetStream(blas_handle, save_stream);
}

template struct ForwardPass<float>;
template struct ForwardPass<double>;

}  // namespace layer_norm_gru
}  // namespace v0
}  // namespace haste

================================================
FILE: lib/layer_norm_indrnn_backward_gpu.cu.cc
================================================
// Copyright 2020 LMNT, Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ==============================================================================
// NOTE(review): the extraction stripped every `<...>` span (template parameter
// lists, template arguments, bracketed #includes, and `<<<...>>>` kernel launch
// configurations). They are restored below; diff against upstream to confirm.

#include <cublas_v2.h>
#include <cuda_runtime_api.h>

#include "blas.h"
#include "haste.h"
#include "inline_ops.h"

namespace {

// Pointwise backward ops for the layer-normalized IndRNN. Each thread owns one
// (batch, hidden) element and walks time in reverse, carrying the recurrent
// gradient in a register; `dk` values are written to `dk_out` for later GEMMs.
template<typename T, bool ApplyZoneout>
__global__
void LayerNormIndrnnBwdOps(
    const int steps,
    const int batch_size,
    const int hidden_size,
    const T* u,
    const T* h_prev,
    const T* h,
    const T* dh_new,
    T* du_out,
    T* db_out,
    T* dh_inout,
    T* dk_out,
    const T* zoneout_mask) {
  const int row = blockDim.x * blockIdx.x + threadIdx.x;
  const int col = blockDim.y * blockIdx.y + threadIdx.y;

  if (row >= hidden_size || col >= batch_size)
    return;

  const int NH = batch_size * hidden_size;
  const int idx = col * hidden_size + row;
  const T u_row = u[row];

  T dh_inout_idx = dh_inout[idx];
  T du_sum = static_cast<T>(0.0);
  T db_sum = static_cast<T>(0.0);

  for (int i = (steps - 1) * NH; i >= 0; i -= NH) {
    T dh_total = dh_new[idx + i] + dh_inout_idx;
    T dh = static_cast<T>(0.0);
    if (ApplyZoneout) {
      const T mask = zoneout_mask[idx + i];
      dh = (static_cast<T>(1.0) - mask) * dh_total;
      dh_total = mask * dh_total;
    }
    const T dk = d_tanh(h[idx + i]) * dh_total;
    dk_out[idx + i] = dk;
    dh_inout_idx = dh + u_row * dk;
    du_sum += h_prev[idx + i] * dk;
    db_sum += dk;
  }

  dh_inout[idx] = dh_inout_idx;
  // `du`/`db` are shared across the batch: accumulate atomically.
  atomicAdd(&du_out[row], du_sum);
  atomicAdd(&db_out[row], db_sum);
}

}  // anonymous namespace

namespace haste {
namespace v0 {
namespace layer_norm_indrnn {

// Opaque implementation state for the backward pass (pimpl idiom).
template<typename T>
struct BackwardPass<T>::private_data {
  int batch_size;
  int input_size;
  int hidden_size;
  cublasHandle_t blas_handle;
  cudaStream_t stream;       // internal stream all work is enqueued on
  cudaStream_t sync_stream;  // caller's stream to synchronize with on teardown
};

template<typename T>
BackwardPass<T>::BackwardPass(
    const int batch_size,
    const int input_size,
    const int hidden_size,
    const cublasHandle_t& blas_handle,
    const cudaStream_t& stream) : data_(new private_data) {
  data_->batch_size = batch_size;
  data_->input_size = input_size;
  data_->hidden_size = hidden_size;
  data_->blas_handle = blas_handle;
  data_->sync_stream = stream;
  cudaStreamCreate(&data_->stream);
}

template<typename T>
BackwardPass<T>::~BackwardPass() {
  if (data_->sync_stream) {
    // Make the caller's stream wait for our pending work instead of blocking
    // the host.
    cudaEvent_t event;
    cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
    cudaEventRecord(event, data_->stream);
    cudaStreamWaitEvent(data_->sync_stream, event, 0);
    cudaEventDestroy(event);
  } else {
    cudaStreamSynchronize(data_->stream);
  }
  cudaStreamDestroy(data_->stream);
  delete data_;
}

// Full backward pass: reverse-time pointwise kernel, layer-norm backward on
// the resulting `dk` (in `workspace`), then GEMMs for `dW` and `dx`.
template<typename T>
void BackwardPass<T>::Run(
    const int steps,
    const T* W_t,
    const T* u,
    const T* b,
    const T* x_t,
    const T* h,
    const T* dh_new,
    T* dx,
    T* dW,
    T* du,
    T* db,
    T* dh,
    T* workspace,
    layer_norm::BackwardPass<T>& layer_norm1,
    const T* zoneout_mask) {
  const T alpha = static_cast<T>(1.0);
  const T beta = static_cast<T>(0.0);

  const typename blas<T>::set_pointer_mode scoped1(data_->blas_handle);

  const int batch_size = data_->batch_size;
  const int input_size = data_->input_size;
  const int hidden_size = data_->hidden_size;
  const cublasHandle_t blas_handle = data_->blas_handle;
  const cudaStream_t stream = data_->stream;

  const dim3 blockDim(64, 16);
  const dim3 gridDim(
      (hidden_size + blockDim.x - 1) / blockDim.x,
      (batch_size + blockDim.y - 1) / blockDim.y);

  const int NH = batch_size * hidden_size;
  if (zoneout_mask) {
    LayerNormIndrnnBwdOps<T, true><<<gridDim, blockDim, 0, stream>>>(
        steps,
        batch_size,
        hidden_size,
        u,
        h,
        h + NH,
        dh_new + NH,
        du,
        db,
        dh,
        workspace,
        zoneout_mask);
  } else {
    LayerNormIndrnnBwdOps<T, false><<<gridDim, blockDim, 0, stream>>>(
        steps,
        batch_size,
        hidden_size,
        u,
        h,
        h + NH,
        dh_new + NH,
        du,
        db,
        dh,
        workspace,
        nullptr);
  }

  // Save/restore the caller's cuBLAS stream around our GEMMs.
  cudaStream_t save_stream;
  cublasGetStream(blas_handle, &save_stream);

  cublasSetStream(blas_handle, stream);
  layer_norm1.Run(stream, workspace, workspace);
  blas<T>::gemm(blas_handle,
      CUBLAS_OP_N, CUBLAS_OP_N,
      hidden_size, input_size, batch_size * steps,
      &alpha,
      workspace, hidden_size,
      x_t, batch_size * steps,
      &beta,
      dW, hidden_size);

  blas<T>::gemm(blas_handle,
      CUBLAS_OP_N, CUBLAS_OP_N,
      input_size, steps * batch_size, hidden_size,
      &alpha,
      W_t, input_size,
      workspace, hidden_size,
      &beta,
      dx, input_size);

  cublasSetStream(blas_handle, save_stream);
}

template class BackwardPass<float>;
template class BackwardPass<double>;

}  // namespace layer_norm_indrnn
}  // namespace v0
}  // namespace haste

================================================
FILE: lib/layer_norm_indrnn_forward_gpu.cu.cc
================================================
// Copyright 2020 LMNT, Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ============================================================================== #include #include #include "blas.h" #include "haste.h" #include "inline_ops.h" namespace { template __global__ void LayerNormIndrnnFwdOps( const int steps, const int batch_size, const int hidden_size, const T* Wx, const T* u, const T* b, const T* h, T* h_out, const float zoneout_prob, const T* zoneout_mask) { const int row = blockDim.x * blockIdx.x + threadIdx.x; const int col = blockDim.y * blockIdx.y + threadIdx.y; if (row >= hidden_size || col >= batch_size) return; const int idx = col * hidden_size + row; const int NH = batch_size * hidden_size; const T u_row = u[row]; const T b_row = b[row]; for (int i = 0; i < steps * NH; i += NH) { const T a = Wx[idx + i] + u_row * h[idx + i] + b_row; T cur_h_value = tanh(a); if (ApplyZoneout) { if (Training) { cur_h_value = (cur_h_value - h[idx + i]) * zoneout_mask[idx + i] + h[idx + i]; } else { cur_h_value = (zoneout_prob * h[idx + i]) + ((1.0f - zoneout_prob) * cur_h_value); } } h_out[idx + i] = cur_h_value; } } } // anonymous namespace namespace haste { namespace v0 { namespace layer_norm_indrnn { template struct ForwardPass::private_data { bool training; int batch_size; int input_size; int hidden_size; cublasHandle_t blas_handle; cudaStream_t stream; cudaStream_t sync_stream; }; template ForwardPass::ForwardPass( const bool training, const int batch_size, const int input_size, const int hidden_size, const cublasHandle_t& blas_handle, const cudaStream_t& stream) : data_(new private_data) { data_->training = training; data_->batch_size = batch_size; data_->input_size = input_size; data_->hidden_size = hidden_size; data_->blas_handle = blas_handle; data_->sync_stream = stream; cudaStreamCreate(&data_->stream); } template ForwardPass::~ForwardPass() { if (data_->sync_stream) { cudaEvent_t event; cudaEventCreateWithFlags(&event, cudaEventDisableTiming); cudaEventRecord(event, data_->stream); cudaStreamWaitEvent(data_->sync_stream, event, 0); 
cudaEventDestroy(event); } else { cudaStreamSynchronize(data_->stream); } cudaStreamDestroy(data_->stream); delete data_; } template void ForwardPass::Run( const int steps, const T* W, const T* u, const T* b, const T* x, T* h, T* workspace, T* act_Wx, layer_norm::ForwardPass& layer_norm1, const float zoneout_prob, const T* zoneout_mask) { static const T alpha = static_cast(1.0); static const T beta = static_cast(0.0); const blas::set_pointer_mode scoped1(data_->blas_handle); const bool training = data_->training; const int batch_size = data_->batch_size; const int input_size = data_->input_size; const int hidden_size = data_->hidden_size; const cublasHandle_t blas_handle = data_->blas_handle; const cudaStream_t stream = data_->stream; cudaStream_t save_stream; cublasGetStream(blas_handle, &save_stream); cublasSetStream(blas_handle, stream); blas::gemm(blas_handle, CUBLAS_OP_N, CUBLAS_OP_N, hidden_size, steps * batch_size, input_size, &alpha, W, hidden_size, x, input_size, &beta, act_Wx, hidden_size); layer_norm1.Run(stream, act_Wx, workspace); const dim3 blockDim(64, 16); const dim3 gridDim( (hidden_size + blockDim.x - 1) / blockDim.x, (batch_size + blockDim.y - 1) / blockDim.y); const int NH = batch_size * hidden_size; if (training) { if (zoneout_prob && zoneout_mask) { LayerNormIndrnnFwdOps<<>>( steps, batch_size, hidden_size, workspace, u, b, h, h + NH, zoneout_prob, zoneout_mask); } else { LayerNormIndrnnFwdOps<<>>( steps, batch_size, hidden_size, workspace, u, b, h, h + NH, 0.0f, nullptr); } } else { if (zoneout_prob && zoneout_mask) { LayerNormIndrnnFwdOps<<>>( steps, batch_size, hidden_size, workspace, u, b, h, h + NH, zoneout_prob, zoneout_mask); } else { LayerNormIndrnnFwdOps<<>>( steps, batch_size, hidden_size, workspace, u, b, h, h + NH, 0.0f, nullptr); } } cublasSetStream(blas_handle, save_stream); } template class ForwardPass; template class ForwardPass; } // namespace layer_norm_indrnn } // namespace v0 } // namespace haste 
================================================ FILE: lib/layer_norm_lstm_backward_gpu.cu.cc ================================================ // Copyright 2020 LMNT, Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // ============================================================================== #include #include #include #include #include "blas.h" #include "haste.h" #include "inline_ops.h" namespace { template __global__ void ComputeOutputGrad( const int batch_size, const int hidden_size, const T* act_c_norm, const T* dh_new, T* dh_inout, T* dlayer_norm, T* v, const T* zoneout_mask) { const int row = blockDim.x * blockIdx.x + threadIdx.x; const int col = blockDim.y * blockIdx.y + threadIdx.y; if (row >= hidden_size || col >= batch_size) return; const int base_idx = col * hidden_size + row; const int stride4_base_idx = col * (hidden_size * 4) + row; const int o_idx = stride4_base_idx + 3 * hidden_size; T dh_total = dh_new[base_idx] + dh_inout[base_idx]; if (ApplyZoneout) { const T mask = zoneout_mask[base_idx]; dh_inout[base_idx] = (static_cast(1.0) - mask) * dh_total; dh_total = mask * dh_total; } else { dh_inout[base_idx] = static_cast(0.0); } const T c_tanh = tanh(act_c_norm[base_idx]); const T o = v[o_idx]; const T do_ = c_tanh * dh_total; const T dc_tanh = o * dh_total; dlayer_norm[base_idx] = d_tanh(c_tanh) * dc_tanh; v[o_idx] = d_sigmoid(o) * do_; } template __global__ void PointwiseOperations(const int batch_dim, const int hidden_dim, 
const T* c, const T* v, const T* dc_new, const T* dlayer_norm, T* db_out, T* dc_inout, T* dv_out) { const int row = blockDim.x * blockIdx.x + threadIdx.x; const int col = blockDim.y * blockIdx.y + threadIdx.y; if (row >= hidden_dim || col >= batch_dim) return; const int base_idx = col * hidden_dim + row; const int stride4_base_idx = col * (hidden_dim * 4) + row; const int i_idx = stride4_base_idx + 0 * hidden_dim; const int g_idx = stride4_base_idx + 1 * hidden_dim; const int f_idx = stride4_base_idx + 2 * hidden_dim; const int o_idx = stride4_base_idx + 3 * hidden_dim; const T i = v[i_idx]; const T g = v[g_idx]; const T f = v[f_idx]; const T o = v[o_idx]; const T dc_total = dc_new[base_idx] + dc_inout[base_idx] + dlayer_norm[base_idx]; const T df = c[base_idx] * dc_total; const T dc = f * dc_total; const T di = g * dc_total; const T dg = i * dc_total; const T dv_g = d_tanh(g) * dg; const T dv_o = o; const T dv_i = d_sigmoid(i) * di; const T dv_f = d_sigmoid(f) * df; // TODO: performance optimization opportunity on this reduce operation. 
atomicAdd(&db_out[row + 0 * hidden_dim], dv_i); atomicAdd(&db_out[row + 1 * hidden_dim], dv_g); atomicAdd(&db_out[row + 2 * hidden_dim], dv_f); atomicAdd(&db_out[row + 3 * hidden_dim], dv_o); dc_inout[base_idx] = dc; dv_out[i_idx] = dv_i; dv_out[g_idx] = dv_g; dv_out[f_idx] = dv_f; dv_out[o_idx] = dv_o; } } // anonymous namespace namespace haste { namespace v0 { namespace layer_norm_lstm { template struct BackwardPass::private_data { int batch_size; int input_size; int hidden_size; cublasHandle_t blas_handle; cudaStream_t stream[3]; cudaEvent_t event; cudaStream_t sync_stream; }; template BackwardPass::BackwardPass( const int batch_size, const int input_size, const int hidden_size, const cublasHandle_t& blas_handle, const cudaStream_t& stream) : data_(new private_data) { data_->batch_size = batch_size; data_->input_size = input_size; data_->hidden_size = hidden_size; data_->blas_handle = blas_handle; data_->sync_stream = stream; cudaStreamCreate(&data_->stream[0]); cudaStreamCreate(&data_->stream[1]); cudaStreamCreate(&data_->stream[2]); cudaEventCreateWithFlags(&data_->event, cudaEventDisableTiming); } template BackwardPass::~BackwardPass() { if (data_->sync_stream) { cudaEventRecord(data_->event, data_->stream[2]); cudaStreamWaitEvent(data_->sync_stream, data_->event, 0); cudaEventRecord(data_->event, data_->stream[1]); cudaStreamWaitEvent(data_->sync_stream, data_->event, 0); cudaEventRecord(data_->event, data_->stream[0]); cudaStreamWaitEvent(data_->sync_stream, data_->event, 0); } else { cudaStreamSynchronize(data_->stream[2]); cudaStreamSynchronize(data_->stream[1]); cudaStreamSynchronize(data_->stream[0]); } cudaEventDestroy(data_->event); cudaStreamDestroy(data_->stream[2]); cudaStreamDestroy(data_->stream[1]); cudaStreamDestroy(data_->stream[0]); delete data_; } template void BackwardPass::IterateInternal( const T* R_t, // [H*4,H] const T* c, // [N,H] const T* c_new, // [N,H] const T* dh_new, // [N,H] const T* dc_new, // [N,H] T* db, // [H*4] T* dh, // 
[N,H] T* dc, // [N,H] T* v, // [N,H*4] T* act_Rh, layer_norm::BackwardPass& layer_norm2, layer_norm::BackwardPass& layer_norm3, T* act_c_norm, const T* zoneout_mask) { const T alpha = static_cast(1.0); const T beta_sum = static_cast(1.0); // Accumulate into output matrix! const int batch_size = data_->batch_size; const int hidden_size = data_->hidden_size; const cublasHandle_t blas_handle = data_->blas_handle; const cudaStream_t stream1 = data_->stream[0]; const cudaEvent_t event = data_->event; // Compute launch configuration for pointwise operations kernel. const dim3 blockDim(64, 16); const dim3 gridDim( (hidden_size + blockDim.x - 1) / blockDim.x, (batch_size + blockDim.y - 1) / blockDim.y); if (zoneout_mask) { ComputeOutputGrad<<>>( batch_size, hidden_size, act_c_norm, dh_new, dh, act_c_norm, v, zoneout_mask); } else { ComputeOutputGrad<<>>( batch_size, hidden_size, act_c_norm, dh_new, dh, act_c_norm, v, nullptr); } layer_norm3.RunPartial(stream1, batch_size, act_c_norm, act_c_norm); PointwiseOperations<<>>( batch_size, hidden_size, c, v, dc_new, act_c_norm, db, dc, v); // Signal completion of pointwise operations for data-dependent streams. 
cudaEventRecord(event, stream1); cublasSetStream(blas_handle, stream1); layer_norm2.RunPartial(stream1, batch_size, v, act_Rh); blas::gemm(blas_handle, CUBLAS_OP_N, CUBLAS_OP_N, hidden_size, batch_size, hidden_size * 4, &alpha, R_t, hidden_size, act_Rh, hidden_size * 4, &beta_sum, dh, hidden_size); } template void BackwardPass::Run( const int steps, const T* W_t, // [H*4,C] const T* R_t, // [H*4,H] const T* b, // [H*4] const T* x_t, // [C,T,N] const T* h, // [T+1,N,H] const T* c, // [T+1,N,H] const T* dh_new, // [T+1,N,H] const T* dc_new, // [T+1,N,H] T* dx, // [T,N,C] T* dW, // [C,H*4] T* dR, // [H,H*4] T* db, // [H*4] T* dh, // [N,H] T* dc, // [N,H] T* act_Wx, // [T,N,H*4] layer_norm::BackwardPass& layer_norm1, T* act_Wx_norm, // [T,N,H*4] T* act_Rh, layer_norm::BackwardPass& layer_norm2, layer_norm::BackwardPass& layer_norm3, T* act_c_norm, const T* zoneout_mask) { const T alpha = static_cast(1.0); const T beta_sum = static_cast(1.0); // Accumulate into output matrix! const T beta_assign = static_cast(0.0); const blas::set_pointer_mode scoped1(data_->blas_handle); const int batch_size = data_->batch_size; const int input_size = data_->input_size; const int hidden_size = data_->hidden_size; const cublasHandle_t blas_handle = data_->blas_handle; const cudaStream_t stream1 = data_->stream[0]; const cudaStream_t stream2 = data_->stream[1]; const cudaStream_t stream3 = data_->stream[2]; const cudaEvent_t event = data_->event; cudaStream_t save_stream; cublasGetStream(blas_handle, &save_stream); const int NH = batch_size * hidden_size; for (int i = steps - 1; i >= 0; --i) { IterateInternal( R_t, c + i * NH, c + (i + 1) * NH, dh_new + (i + 1) * NH, dc_new + (i + 1) * NH, db, dh, dc, act_Wx_norm + i * NH * 4, act_Rh + i * NH * 4, layer_norm2, layer_norm3, act_c_norm + i * NH, zoneout_mask ? 
zoneout_mask + i * NH : nullptr); } cudaEventRecord(event, stream1); cudaStreamWaitEvent(stream2, event, 0); layer_norm1.Run(stream2, act_Wx_norm, act_Wx); cublasSetStream(blas_handle, stream2); blas::gemm(blas_handle, CUBLAS_OP_N, CUBLAS_OP_N, hidden_size * 4, input_size, batch_size * steps, &alpha, act_Wx, hidden_size * 4, x_t, batch_size * steps, &beta_sum, dW, hidden_size * 4); cudaStreamWaitEvent(stream3, event, 0); cublasSetStream(blas_handle, stream1); blas::gemm(blas_handle, CUBLAS_OP_N, CUBLAS_OP_T, hidden_size * 4, hidden_size, batch_size * steps, &alpha, act_Rh, hidden_size * 4, h, hidden_size, &beta_sum, dR, hidden_size * 4); cublasSetStream(blas_handle, stream2); blas::gemm(blas_handle, CUBLAS_OP_N, CUBLAS_OP_N, input_size, steps * batch_size, hidden_size * 4, &alpha, W_t, input_size, act_Wx, hidden_size * 4, &beta_assign, dx, input_size); cublasSetStream(blas_handle, save_stream); } template struct BackwardPass; template struct BackwardPass; } // namespace layer_norm_lstm } // namespace v0 } // namespace haste ================================================ FILE: lib/layer_norm_lstm_forward_gpu.cu.cc ================================================ // Copyright 2020 LMNT, Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // ============================================================================== #include #include #include "blas.h" #include "haste.h" #include "inline_ops.h" namespace { // `c` and `c_out` may be aliased. 
template __global__ void ComputeCellState( const int batch_size, const int hidden_size, const T* Wx, // Precomputed (Wx) vector const T* Rh, // Precomputed (Rh) vector const T* b, // Bias for gates const T* c, // Input cell state T* c_out, // Output cell state T* v_out) { // Output vector v (Wx + Rh + b) const int row = blockDim.x * blockIdx.x + threadIdx.x; const int col = blockDim.y * blockIdx.y + threadIdx.y; if (row >= hidden_size || col >= batch_size) return; // Base index into the Wx and Rh matrices. const int weight_idx = col * (hidden_size * 4) + row; // Base index into the output matrix. This is different from `weight_idx` because // the number of rows are different between the two sets of matrices. const int output_idx = col * hidden_size + row; const int i_idx = weight_idx + 0 * hidden_size; const int g_idx = weight_idx + 1 * hidden_size; const int f_idx = weight_idx + 2 * hidden_size; const int o_idx = weight_idx + 3 * hidden_size; const T i = sigmoid(Wx[i_idx] + Rh[i_idx] + b[row + 0 * hidden_size]); const T g = tanh (Wx[g_idx] + Rh[g_idx] + b[row + 1 * hidden_size]); const T f = sigmoid(Wx[f_idx] + Rh[f_idx] + b[row + 2 * hidden_size]); const T o = sigmoid(Wx[o_idx] + Rh[o_idx] + b[row + 3 * hidden_size]); // Compile-time constant branch should be eliminated by compiler so we have // straight-through code. if (Training) { v_out[i_idx] = i; v_out[g_idx] = g; v_out[f_idx] = f; v_out[o_idx] = o; } else { v_out[o_idx] = o; } c_out[output_idx] = (f * c[output_idx]) + (i * g); } // `h` and `h_out` may be aliased. 
template __global__ void ComputeCellOutput( const int batch_size, const int hidden_size, const T* h, // Input recurrent state const T* c, // Input cell state const T* v, T* h_out, // Output recurrent state const float zoneout_prob, const T* zoneout_mask) { // Zoneout mask (only used if ApplyZoneout==true) const int row = blockDim.x * blockIdx.x + threadIdx.x; const int col = blockDim.y * blockIdx.y + threadIdx.y; if (row >= hidden_size || col >= batch_size) return; const int weight_idx = col * (hidden_size * 4) + row; const int output_idx = col * hidden_size + row; const T o = v[weight_idx + 3 * hidden_size]; const T cur_c_value = c[output_idx]; T cur_h_value = o * tanh(cur_c_value); if (ApplyZoneout) { if (Training) { cur_h_value = (cur_h_value - h[output_idx]) * zoneout_mask[output_idx] + h[output_idx]; } else { cur_h_value = (zoneout_prob * h[output_idx]) + ((1.0f - zoneout_prob) * cur_h_value); } } h_out[output_idx] = cur_h_value; } } // anonymous namespace namespace haste { namespace v0 { namespace layer_norm_lstm { template struct ForwardPass::private_data { bool training; int batch_size; int input_size; int hidden_size; cublasHandle_t blas_handle; cudaStream_t stream[2]; cudaEvent_t event; cudaStream_t sync_stream; }; template ForwardPass::ForwardPass( const bool training, const int batch_size, const int input_size, const int hidden_size, const cublasHandle_t& blas_handle, const cudaStream_t& stream) : data_(new private_data) { data_->training = training; data_->batch_size = batch_size; data_->input_size = input_size; data_->hidden_size = hidden_size; data_->blas_handle = blas_handle; data_->sync_stream = stream; cudaStreamCreate(&data_->stream[0]); cudaStreamCreate(&data_->stream[1]); cudaEventCreateWithFlags(&data_->event, cudaEventDisableTiming); } template ForwardPass::~ForwardPass() { if (data_->sync_stream) { cudaEventRecord(data_->event, data_->stream[1]); cudaStreamWaitEvent(data_->sync_stream, data_->event, 0); cudaEventRecord(data_->event, 
data_->stream[0]); cudaStreamWaitEvent(data_->sync_stream, data_->event, 0); } else { cudaStreamSynchronize(data_->stream[1]); cudaStreamSynchronize(data_->stream[0]); } cudaEventDestroy(data_->event); cudaStreamDestroy(data_->stream[1]); cudaStreamDestroy(data_->stream[0]); delete data_; } template void ForwardPass::IterateInternal( const T* R, // Weight matrix for recurrent state (Rh) [H,H*4] const T* b, // Bias for gates (Wx + Rh + b) [H*4] const T* h, // Recurrent state [N,H] const T* c, // Cell state [N,H] T* h_out, // Output recurrent state [N,H] T* c_out, // Output cell state [N,H] T* v, // Output vector (Wx + Rh + b) [N,H*4] T* tmp_Rh, // Temporary storage for Rh vector [N,H*4] T* act_Rh, layer_norm::ForwardPass& layer_norm2, layer_norm::ForwardPass& layer_norm3, T* act_c_norm, const float zoneout_prob, const T* zoneout_mask) { // Zoneout mask [N,H] static const T alpha = static_cast(1.0); static const T beta = static_cast(0.0); const bool training = data_->training; const int batch_size = data_->batch_size; const int hidden_size = data_->hidden_size; const cublasHandle_t blas_handle = data_->blas_handle; const cudaStream_t stream1 = data_->stream[0]; const cudaEvent_t event = data_->event; cublasSetStream(blas_handle, stream1); blas::gemm(blas_handle, CUBLAS_OP_N, CUBLAS_OP_N, hidden_size * 4, batch_size, hidden_size, &alpha, R, hidden_size * 4, h, hidden_size, &beta, act_Rh, hidden_size * 4); layer_norm2.RunPartial(stream1, batch_size, act_Rh, tmp_Rh); cudaStreamWaitEvent(stream1, event, 0); // Compute launch configuration for pointwise operations kernel. 
const dim3 blockDim(64, 16); const dim3 gridDim( (hidden_size + blockDim.x - 1) / blockDim.x, (batch_size + blockDim.y - 1) / blockDim.y); if (training) { ComputeCellState<<>>( batch_size, hidden_size, v, tmp_Rh, b, c, c_out, v); layer_norm3.RunPartial(stream1, batch_size, c_out, act_c_norm); if (zoneout_prob && zoneout_mask) { ComputeCellOutput<<>>( batch_size, hidden_size, h, act_c_norm, v, h_out, zoneout_prob, zoneout_mask); } else { ComputeCellOutput<<>>( batch_size, hidden_size, h, act_c_norm, v, h_out, 0.0f, nullptr); } } else { ComputeCellState<<>>( batch_size, hidden_size, v, tmp_Rh, b, c, c_out, v); layer_norm3.RunPartial(stream1, batch_size, c_out, act_c_norm); if (zoneout_prob && zoneout_mask) { ComputeCellOutput<<>>( batch_size, hidden_size, h, act_c_norm, v, h_out, zoneout_prob, zoneout_mask); } else { ComputeCellOutput<<>>( batch_size, hidden_size, h, act_c_norm, v, h_out, 0.0f, nullptr); } } } template void ForwardPass::Run( const int steps, const T* W, // Weight matrix for input (Wx) [C,H*4] const T* R, // Weight matrix for recurrent state (Rh) [H,H*4] const T* b, // Bias for gates (Wx + Rh + b) [H*4] const T* x, // Input vector [T,N,C] T* h, // Recurrent state [T+1,N,H] T* c, // Cell state [T+1,N,H] T* act_Wx, // Output vector (Wx + Rh + b) [T,N,H*4] T* tmp_Rh, // Temporary storage for Rh vector [N,H*4] layer_norm::ForwardPass& layer_norm1, T* act_Wx_norm, T* act_Rh, layer_norm::ForwardPass& layer_norm2, layer_norm::ForwardPass& layer_norm3, T* act_c_norm, const float zoneout_prob, const T* zoneout_mask) { // Zoneout mask [T,N,H] static const T alpha = static_cast(1.0); static const T beta = static_cast(0.0); const blas::set_pointer_mode scoped1(data_->blas_handle); const int batch_size = data_->batch_size; const int input_size = data_->input_size; const int hidden_size = data_->hidden_size; const cublasHandle_t blas_handle = data_->blas_handle; const cudaStream_t stream1 = data_->stream[0]; cudaStream_t save_stream; cublasGetStream(blas_handle, 
&save_stream); cublasSetStream(blas_handle, stream1); blas::gemm(blas_handle, CUBLAS_OP_N, CUBLAS_OP_N, hidden_size * 4, steps * batch_size, input_size, &alpha, W, hidden_size * 4, x, input_size, &beta, act_Wx, hidden_size * 4); layer_norm1.Run(stream1, act_Wx, act_Wx_norm); for (int i = 0; i < steps; ++i) { const int NH = batch_size * hidden_size; IterateInternal( R, b, h + i * NH, c + i * NH, h + (i + 1) * NH, c + (i + 1) * NH, act_Wx_norm + i * NH * 4, tmp_Rh, act_Rh + i * NH * 4, layer_norm2, layer_norm3, act_c_norm + i * NH, zoneout_prob, zoneout_mask ? zoneout_mask + i * NH : nullptr); } cublasSetStream(blas_handle, save_stream); } template struct ForwardPass; template struct ForwardPass; } // namespace layer_norm_lstm } // namespace v0 } // namespace haste ================================================ FILE: lib/lstm_backward_gpu.cu.cc ================================================ // Copyright 2020 LMNT, Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
// ============================================================================== #include #include #include #include #include "blas.h" #include "haste.h" #include "inline_ops.h" namespace { template __global__ void PointwiseOperations(const int batch_dim, const int hidden_dim, const T* c, const T* v, const T* c_new, const T* dh_new, const T* dc_new, T* db_out, T* dh_inout, T* dc_inout, T* dv_out, const T* zoneout_mask) { // Zoneout mask (only used if ApplyZoneout==true) const int row = blockDim.x * blockIdx.x + threadIdx.x; const int col = blockDim.y * blockIdx.y + threadIdx.y; if (row >= hidden_dim || col >= batch_dim) return; const int base_idx = col * hidden_dim + row; T dc_total = dc_new[base_idx] + dc_inout[base_idx]; T dh_total = dh_new[base_idx] + dh_inout[base_idx]; const T c_tanh = tanh(c_new[base_idx]); const int stride4_base_idx = col * (hidden_dim * 4) + row; const int i_idx = stride4_base_idx + 0 * hidden_dim; const int g_idx = stride4_base_idx + 1 * hidden_dim; const int f_idx = stride4_base_idx + 2 * hidden_dim; const int o_idx = stride4_base_idx + 3 * hidden_dim; const T i = v[i_idx]; const T g = v[g_idx]; const T f = v[f_idx]; const T o = v[o_idx]; if (ApplyZoneout) { const T mask = zoneout_mask[base_idx]; dh_inout[base_idx] = (static_cast(1.0) - mask) * dh_total; dh_total = mask * dh_total; } else { dh_inout[base_idx] = static_cast(0.0); } const T do_ = c_tanh * dh_total; const T dc_tanh = o * dh_total; dc_total += d_tanh(c_tanh) * dc_tanh; const T df = c[base_idx] * dc_total; const T dc = f * dc_total; const T di = g * dc_total; const T dg = i * dc_total; const T dv_g = d_tanh(g) * dg; const T dv_o = d_sigmoid(o) * do_; const T dv_i = d_sigmoid(i) * di; const T dv_f = d_sigmoid(f) * df; // TODO: performance optimization opportunity on this reduce operation. 
atomicAdd(&db_out[row + 0 * hidden_dim], dv_i); atomicAdd(&db_out[row + 1 * hidden_dim], dv_g); atomicAdd(&db_out[row + 2 * hidden_dim], dv_f); atomicAdd(&db_out[row + 3 * hidden_dim], dv_o); dc_inout[base_idx] = dc; dv_out[i_idx] = dv_i; dv_out[g_idx] = dv_g; dv_out[f_idx] = dv_f; dv_out[o_idx] = dv_o; } } // anonymous namespace namespace haste { namespace v0 { namespace lstm { template struct BackwardPass::private_data { int batch_size; int input_size; int hidden_size; cublasHandle_t blas_handle; cudaStream_t stream[3]; cudaEvent_t event; }; template BackwardPass::BackwardPass( const int batch_size, const int input_size, const int hidden_size, const cublasHandle_t& blas_handle, const cudaStream_t& stream) : data_(new private_data) { data_->batch_size = batch_size; data_->input_size = input_size; data_->hidden_size = hidden_size; data_->blas_handle = blas_handle; cudaStreamCreate(&data_->stream[0]); cudaStreamCreate(&data_->stream[1]); cudaStreamCreate(&data_->stream[2]); cudaEventCreateWithFlags(&data_->event, cudaEventDisableTiming); } template BackwardPass::~BackwardPass() { cudaStreamSynchronize(data_->stream[2]); cudaStreamSynchronize(data_->stream[1]); cudaStreamSynchronize(data_->stream[0]); cudaEventDestroy(data_->event); cudaStreamDestroy(data_->stream[2]); cudaStreamDestroy(data_->stream[1]); cudaStreamDestroy(data_->stream[0]); delete data_; } template void BackwardPass::Iterate( const cudaStream_t& stream, const T* W_t, // [H*4,C] const T* R_t, // [H*4,H] const T* b, // [H*4] const T* x_t, // [C,N] const T* h, // [N,H] const T* c, // [N,H] const T* c_new, // [N,H] const T* dh_new, // [N,H] const T* dc_new, // [N,H] T* dx, // [N,C] T* dW, // [C,H*4] T* dR, // [H,H*4] T* db, // [H*4] T* dh, // [N,H] T* dc, // [N,H] T* v, // [N,H*4] const T* zoneout_mask) { const T alpha = static_cast(1.0); const T beta_sum = static_cast(1.0); // Accumulate into output matrix! 
const T beta_assign = static_cast(0.0); const blas::set_pointer_mode scoped1(data_->blas_handle); const int batch_size = data_->batch_size; const int input_size = data_->input_size; const int hidden_size = data_->hidden_size; const cublasHandle_t blas_handle = data_->blas_handle; const cudaStream_t stream1 = data_->stream[0]; const cudaStream_t stream2 = data_->stream[1]; const cudaStream_t stream3 = data_->stream[2]; const cudaEvent_t event = data_->event; cudaStream_t save_stream; cublasGetStream(blas_handle, &save_stream); // Make sure inputs are ready before using them. if (stream) { cudaEventRecord(event, stream); cudaStreamWaitEvent(stream1, event, 0); } IterateInternal( R_t, c, c_new, dh_new, dc_new, db, dh, dc, v, zoneout_mask); // Wait for pointwise operations to complete since there's a // data dependency between its output (`v`) and the following matmuls. cudaStreamWaitEvent(stream2, event, 0); cudaStreamWaitEvent(stream3, event, 0); cublasSetStream(blas_handle, stream2); blas::gemm(blas_handle, CUBLAS_OP_N, CUBLAS_OP_N, input_size, batch_size, hidden_size * 4, &alpha, W_t, input_size, v, hidden_size * 4, &beta_assign, dx, input_size); // We can get away with only waiting for the `dx` and `dh` outputs and // let the `dR` and `dW` matrices complete whenever they complete. It's // a little unsafe, but we make the assumption that callers won't have // upstream data-dependencies on those matrices. 
if (stream) { cudaEventRecord(event, stream2); cudaStreamWaitEvent(stream, event, 0); } cublasSetStream(blas_handle, stream3); blas::gemm(blas_handle, CUBLAS_OP_N, CUBLAS_OP_T, hidden_size * 4, hidden_size, batch_size, &alpha, v, hidden_size * 4, h, hidden_size, &beta_sum, dR, hidden_size * 4); cublasSetStream(blas_handle, stream3); blas::gemm(blas_handle, CUBLAS_OP_N, CUBLAS_OP_N, hidden_size * 4, input_size, batch_size, &alpha, v, hidden_size * 4, x_t, batch_size, &beta_sum, dW, hidden_size * 4); cublasSetStream(blas_handle, save_stream); } template void BackwardPass::IterateInternal( const T* R_t, // [H*4,H] const T* c, // [N,H] const T* c_new, // [N,H] const T* dh_new, // [N,H] const T* dc_new, // [N,H] T* db, // [H*4] T* dh, // [N,H] T* dc, // [N,H] T* v, // [N,H*4] const T* zoneout_mask) { const T alpha = static_cast(1.0); const T beta_sum = static_cast(1.0); // Accumulate into output matrix! const int batch_size = data_->batch_size; const int hidden_size = data_->hidden_size; const cublasHandle_t blas_handle = data_->blas_handle; const cudaStream_t stream1 = data_->stream[0]; const cudaEvent_t event = data_->event; // Compute launch configuration for pointwise operations kernel. const dim3 blockDim(64, 16); const dim3 gridDim( (hidden_size + blockDim.x - 1) / blockDim.x, (batch_size + blockDim.y - 1) / blockDim.y); if (zoneout_mask) { PointwiseOperations<<>>( batch_size, hidden_size, c, v, c_new, dh_new, dc_new, db, dh, dc, v, zoneout_mask ); } else { PointwiseOperations<<>>( batch_size, hidden_size, c, v, c_new, dh_new, dc_new, db, dh, dc, v, nullptr ); } // Signal completion of pointwise operations for data-dependent streams. 
cudaEventRecord(event, stream1); cublasSetStream(blas_handle, stream1); blas::gemm(blas_handle, CUBLAS_OP_N, CUBLAS_OP_N, hidden_size, batch_size, hidden_size * 4, &alpha, R_t, hidden_size, v, hidden_size * 4, &beta_sum, dh, hidden_size); } template void BackwardPass::Run( const int steps, const T* W_t, // [H*4,C] const T* R_t, // [H*4,H] const T* b, // [H*4] const T* x_t, // [C,T,N] const T* h, // [T+1,N,H] const T* c, // [T+1,N,H] const T* dh_new, // [T+1,N,H] const T* dc_new, // [T+1,N,H] T* dx, // [T,N,C] T* dW, // [C,H*4] T* dR, // [H,H*4] T* db, // [H*4] T* dh, // [N,H] T* dc, // [N,H] T* v, // [T,N,H*4] const T* zoneout_mask) { const T alpha = static_cast(1.0); const T beta_sum = static_cast(1.0); // Accumulate into output matrix! const T beta_assign = static_cast(0.0); const blas::set_pointer_mode scoped1(data_->blas_handle); const int batch_size = data_->batch_size; const int input_size = data_->input_size; const int hidden_size = data_->hidden_size; const cublasHandle_t blas_handle = data_->blas_handle; const cudaStream_t stream1 = data_->stream[0]; const cudaStream_t stream2 = data_->stream[1]; const cudaStream_t stream3 = data_->stream[2]; const cudaEvent_t event = data_->event; cudaStream_t save_stream; cublasGetStream(blas_handle, &save_stream); const int NH = batch_size * hidden_size; for (int i = steps - 1; i >= 0; --i) { IterateInternal( R_t, c + i * NH, c + (i + 1) * NH, dh_new + (i + 1) * NH, dc_new + (i + 1) * NH, db, dh, dc, v + i * NH * 4, zoneout_mask ? 
zoneout_mask + i * NH : nullptr); } cudaEventRecord(event, stream1); cudaStreamWaitEvent(stream2, event, 0); cublasSetStream(blas_handle, stream2); blas::gemm(blas_handle, CUBLAS_OP_N, CUBLAS_OP_N, hidden_size * 4, input_size, batch_size * steps, &alpha, v, hidden_size * 4, x_t, batch_size * steps, &beta_sum, dW, hidden_size * 4); cudaStreamWaitEvent(stream3, event, 0); cublasSetStream(blas_handle, stream1); blas::gemm(blas_handle, CUBLAS_OP_N, CUBLAS_OP_T, hidden_size * 4, hidden_size, batch_size * steps, &alpha, v, hidden_size * 4, h, hidden_size, &beta_sum, dR, hidden_size * 4); cublasSetStream(blas_handle, stream1); blas::gemm(blas_handle, CUBLAS_OP_N, CUBLAS_OP_N, input_size, steps * batch_size, hidden_size * 4, &alpha, W_t, input_size, v, hidden_size * 4, &beta_assign, dx, input_size); cublasSetStream(blas_handle, save_stream); } template struct BackwardPass; template struct BackwardPass; } // namespace lstm } // namespace v0 } // namespace haste ================================================ FILE: lib/lstm_forward_gpu.cu.cc ================================================ // Copyright 2020 LMNT, Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // ============================================================================== #include #include #include "blas.h" #include "haste.h" #include "inline_ops.h" namespace { // `h` and `h_out` may be aliased. // `c` and `c_out` may be aliased. 
// Fused LSTM pointwise step: one thread computes one (batch, hidden) cell —
// the four gate activations, the new cell state, and the new hidden state.
//
// Template flags are compile-time constants so dead branches are eliminated:
//   Training     - additionally store gate activations into `v_out` for backprop.
//   ApplyZoneout - blend the new h with the previous h (per-element mask while
//                  training, expected value at inference).
//
// NOTE(review): the extraction stripped template parameter lists; the
// `<typename T, bool Training, bool ApplyZoneout>` form below is reconstructed
// from the call sites (`PointwiseOperations<T, true, true>` etc.) — confirm.
template<typename T, bool Training, bool ApplyZoneout>
__global__
void PointwiseOperations(const int batch_dim,
                         const int hidden_dim,
                         const T* Wx,   // Precomputed (Wx) vector
                         const T* Rh,   // Precomputed (Rh) vector
                         const T* b,    // Bias for gates
                         const T* h,    // Input recurrent state
                         const T* c,    // Input cell state
                         T* h_out,      // Output recurrent state
                         T* c_out,      // Output cell state
                         T* v_out,      // Output vector v (Wx + Rh + b) (only used if Training==true)
                         const float zoneout_prob,
                         const T* zoneout_mask) {  // Zoneout mask (only used if ApplyZoneout==true)
  // We're in column-major order here, so increase x => increase row.
  const int row = blockDim.x * blockIdx.x + threadIdx.x;
  const int col = blockDim.y * blockIdx.y + threadIdx.y;

  if (row >= hidden_dim || col >= batch_dim)
    return;

  // Base index into the Wx and Rh matrices.
  const int weight_idx = col * (hidden_dim * 4) + row;

  // Base index into the output matrix. This is different from `weight_idx` because
  // the number of rows are different between the two sets of matrices.
  const int output_idx = col * hidden_dim + row;

  // Gates are stored contiguously in i, g, f, o order.
  const int i_idx = weight_idx + 0 * hidden_dim;
  const int g_idx = weight_idx + 1 * hidden_dim;
  const int f_idx = weight_idx + 2 * hidden_dim;
  const int o_idx = weight_idx + 3 * hidden_dim;

  const T i = sigmoid(Wx[i_idx] + Rh[i_idx] + b[row + 0 * hidden_dim]);
  const T g = tanh   (Wx[g_idx] + Rh[g_idx] + b[row + 1 * hidden_dim]);
  const T f = sigmoid(Wx[f_idx] + Rh[f_idx] + b[row + 2 * hidden_dim]);
  const T o = sigmoid(Wx[o_idx] + Rh[o_idx] + b[row + 3 * hidden_dim]);

  // Compile-time constant branch should be eliminated by compiler so we have
  // straight-through code.
  if (Training) {
    // Save activations (not pre-activations) for the backward pass.
    v_out[i_idx] = i;
    v_out[g_idx] = g;
    v_out[f_idx] = f;
    v_out[o_idx] = o;
  }

  T cur_c_value = (f * c[output_idx]) + (i * g);
  T cur_h_value = o * tanh(cur_c_value);

  if (ApplyZoneout) {
    if (Training) {
      // mask==1 keeps the new value, mask==0 keeps the old h.
      cur_h_value = (cur_h_value - h[output_idx]) * zoneout_mask[output_idx] + h[output_idx];
    } else {
      // At inference, use the expectation of the zoneout distribution.
      cur_h_value = (zoneout_prob * h[output_idx]) + ((1.0f - zoneout_prob) * cur_h_value);
    }
  }

  c_out[output_idx] = cur_c_value;
  h_out[output_idx] = cur_h_value;
}

}  // anonymous namespace

namespace haste {
namespace v0 {
namespace lstm {

// Opaque implementation state for ForwardPass (pimpl idiom).
template<typename T>
struct ForwardPass<T>::private_data {
  bool training;
  int batch_size;
  int input_size;
  int hidden_size;
  cublasHandle_t blas_handle;  // borrowed from the caller, not owned
  cudaStream_t stream[2];      // internal streams: recurrence / input GEMM
  cudaEvent_t event;
  cudaEvent_t ready_event;
  cudaEvent_t finished_event;
};

// Creates the internal streams and (timing-disabled) events used to overlap
// the input GEMM with the recurrent GEMM + pointwise kernel. Note the incoming
// `stream` argument is accepted but not stored here.
template<typename T>
ForwardPass<T>::ForwardPass(
    const bool training,
    const int batch_size,
    const int input_size,
    const int hidden_size,
    const cublasHandle_t& blas_handle,
    const cudaStream_t& stream) : data_(new private_data) {
  data_->training = training;
  data_->batch_size = batch_size;
  data_->input_size = input_size;
  data_->hidden_size = hidden_size;
  data_->blas_handle = blas_handle;
  cudaStreamCreate(&data_->stream[0]);
  cudaStreamCreate(&data_->stream[1]);
  cudaEventCreateWithFlags(&data_->event, cudaEventDisableTiming);
  cudaEventCreateWithFlags(&data_->ready_event, cudaEventDisableTiming);
  cudaEventCreateWithFlags(&data_->finished_event, cudaEventDisableTiming);
}

// Drains both internal streams before destroying events/streams so no
// in-flight work references freed resources.
template<typename T>
ForwardPass<T>::~ForwardPass() {
  cudaStreamSynchronize(data_->stream[1]);
  cudaStreamSynchronize(data_->stream[0]);
  cudaEventDestroy(data_->finished_event);
  cudaEventDestroy(data_->ready_event);
  cudaEventDestroy(data_->event);
  cudaStreamDestroy(data_->stream[1]);
  cudaStreamDestroy(data_->stream[0]);
  delete data_;
}

// Performs a single LSTM timestep: Wx GEMM on stream2 in parallel with the
// recurrence (Rh GEMM + pointwise kernel) on stream1, synchronized against the
// caller-supplied `stream`.
template<typename T>
void ForwardPass<T>::Iterate(
    const cudaStream_t& stream,
    const T* W,  // Weight matrix for input (Wx) [C,H*4]
    const T* R,  // Weight matrix for recurrent state (Rh) [H,H*4]
    const T* b,  // Bias for gates (Wx + Rh + b) [H*4]
    const T* x,
                 // Input vector [N,C]
    const T* h,       // Recurrent state [N,H]
    const T* c,       // Cell state [N,H]
    T* h_out,         // Output recurrent state [N,H]
    T* c_out,         // Output cell state [N,H]
    T* v,             // Output vector (Wx + Rh + b) [N,H*4]
    T* tmp_Rh,        // Temporary storage for Rh vector [N,H*4]
    const float zoneout_prob,
    const T* zoneout_mask) {  // Zoneout mask [N,H]
  // Constants for GEMM
  static const T alpha = static_cast<T>(1.0);
  static const T beta = static_cast<T>(0.0);

  const blas<void>::set_pointer_mode scoped1(data_->blas_handle);

  const int batch_size = data_->batch_size;
  const int input_size = data_->input_size;
  const int hidden_size = data_->hidden_size;
  const cublasHandle_t blas_handle = data_->blas_handle;
  const cudaStream_t stream1 = data_->stream[0];
  const cudaStream_t stream2 = data_->stream[1];
  const cudaEvent_t event = data_->event;

  cudaStream_t save_stream;
  cublasGetStream(blas_handle, &save_stream);

  // Make sure inputs are ready before we use them.
  if (stream) {
    cudaEventRecord(event, stream);
    cudaStreamWaitEvent(stream2, event, 0);
  }

  // Input GEMM (v = W * x) runs on stream2, concurrently with the Rh GEMM
  // that IterateInternal issues on stream1.
  cublasSetStream(blas_handle, stream2);
  blas<T>::gemm(blas_handle,
      CUBLAS_OP_N, CUBLAS_OP_N,
      hidden_size * 4, batch_size, input_size,
      &alpha,
      W, hidden_size * 4,
      x, input_size,
      &beta,
      v, hidden_size * 4);
  // The pointwise kernel (stream1) must not start before `v` is ready.
  cudaEventRecord(event, stream2);

  IterateInternal(
      R,
      b,
      h,
      c,
      h_out,
      c_out,
      v,
      tmp_Rh,
      zoneout_prob,
      zoneout_mask);

  // Make sure outputs have settled.
  if (stream) {
    cudaEventRecord(event, stream1);
    cudaStreamWaitEvent(stream, event, 0);
  }

  cublasSetStream(blas_handle, save_stream);
}

// One recurrent step: compute tmp_Rh = R * h on stream1, wait for the input
// GEMM (signalled via `event`), then launch the fused pointwise kernel which
// sums Wx (in `v`), Rh and b and produces h_out/c_out.
template<typename T>
void ForwardPass<T>::IterateInternal(
    const T* R,  // Weight matrix for recurrent state (Rh) [H,H*4]
    const T* b,  // Bias for gates (Wx + Rh + b) [H*4]
    const T* h,  // Recurrent state [N,H]
    const T* c,  // Cell state [N,H]
    T* h_out,    // Output recurrent state [N,H]
    T* c_out,    // Output cell state [N,H]
    T* v,        // Output vector (Wx + Rh + b) [N,H*4]
    T* tmp_Rh,   // Temporary storage for Rh vector [N,H*4]
    const float zoneout_prob,
    const T* zoneout_mask) {  // Zoneout mask [N,H]
  static const T alpha = static_cast<T>(1.0);
  static const T beta = static_cast<T>(0.0);

  const bool training = data_->training;
  const int batch_size = data_->batch_size;
  const int hidden_size = data_->hidden_size;
  const cublasHandle_t blas_handle = data_->blas_handle;
  const cudaStream_t stream1 = data_->stream[0];
  const cudaEvent_t event = data_->event;

  cublasSetStream(blas_handle, stream1);
  blas<T>::gemm(blas_handle,
      CUBLAS_OP_N, CUBLAS_OP_N,
      hidden_size * 4, batch_size, hidden_size,
      &alpha,
      R, hidden_size * 4,
      h, hidden_size,
      &beta,
      tmp_Rh, hidden_size * 4);

  // Wait for the input GEMM (recorded on stream2 by the caller) before the
  // pointwise kernel reads `v`.
  cudaStreamWaitEvent(stream1, event, 0);

  // Compute launch configuration for pointwise operations kernel.
  const dim3 blockDim(64, 16);
  const dim3 gridDim(
      (hidden_size + blockDim.x - 1) / blockDim.x,
      (batch_size + blockDim.y - 1) / blockDim.y);

  // Dispatch over the (training, zoneout) compile-time kernel variants.
  // NOTE(review): the extraction stripped the launch/template syntax
  // (`PointwiseOperations<<>>`); the `<T, ...><<<gridDim, blockDim, 0,
  // stream1>>>` form below is reconstructed — confirm against the repository.
  if (training) {
    if (zoneout_prob && zoneout_mask) {
      PointwiseOperations<T, true, true><<<gridDim, blockDim, 0, stream1>>>(
          batch_size,
          hidden_size,
          v,
          tmp_Rh,
          b,
          h,
          c,
          h_out,
          c_out,
          v,
          zoneout_prob,
          zoneout_mask);
    } else {
      PointwiseOperations<T, true, false><<<gridDim, blockDim, 0, stream1>>>(
          batch_size,
          hidden_size,
          v,
          tmp_Rh,
          b,
          h,
          c,
          h_out,
          c_out,
          v,
          0.0f,
          nullptr);
    }
  } else {
    if (zoneout_prob && zoneout_mask) {
      PointwiseOperations<T, false, true><<<gridDim, blockDim, 0, stream1>>>(
          batch_size,
          hidden_size,
          v,
          tmp_Rh,
          b,
          h,
          c,
          h_out,
          c_out,
          nullptr,
          zoneout_prob,
          zoneout_mask);
    } else {
      PointwiseOperations<T, false, false><<<gridDim, blockDim, 0, stream1>>>(
          batch_size,
          hidden_size,
          v,
          tmp_Rh,
          b,
          h,
          c,
          h_out,
          c_out,
          nullptr,
          0.0f,
          nullptr);
    }
  }
}

// Runs the full forward pass over `steps` timesteps: one sequence-wide input
// GEMM (v = W * x for all T at once), then the per-step recurrence.
template<typename T>
void ForwardPass<T>::Run(
    const int steps,
    const T* W,  // Weight matrix for input (Wx) [C,H*4]
    const T* R,  // Weight matrix for recurrent state (Rh) [H,H*4]
    const T* b,  // Bias for gates (Wx + Rh + b) [H*4]
    const T* x,  // Input vector [T,N,C]
    T* h,        // Recurrent state [T+1,N,H]
    T* c,        // Cell state [T+1,N,H]
    T* v,        // Output vector (Wx + Rh + b) [T,N,H*4]
    T* tmp_Rh,   // Temporary storage for Rh vector [N,H*4]
    const float zoneout_prob,
    const T* zoneout_mask) {  // Zoneout mask [T,N,H]
  static const T alpha = static_cast<T>(1.0);
  static const T beta = static_cast<T>(0.0);

  const blas<void>::set_pointer_mode scoped1(data_->blas_handle);

  const int batch_size = data_->batch_size;
  const int input_size = data_->input_size;
  const int hidden_size = data_->hidden_size;
  const cublasHandle_t blas_handle = data_->blas_handle;
  const cudaStream_t stream1 = data_->stream[0];

  cudaStream_t save_stream;
  cublasGetStream(blas_handle, &save_stream);

  cublasSetStream(blas_handle, stream1);
  blas<T>::gemm(blas_handle,
      CUBLAS_OP_N, CUBLAS_OP_N,
      hidden_size * 4, steps * batch_size, input_size,
      &alpha,
      W, hidden_size * 4,
      x, input_size,
      &beta,
      v, hidden_size * 4);

  for (int i = 0; i < steps; ++i) {
    const int NH = batch_size * hidden_size;
    IterateInternal(
        R,
        b,
        h + i * NH,
        c + i * NH,
        h + (i + 1) * NH,
        c + (i + 1) * NH,
        v + i * NH * 4,
        tmp_Rh,
        zoneout_prob,
        zoneout_mask ? zoneout_mask + i * NH : nullptr);
  }

  cublasSetStream(blas_handle, save_stream);
}

template struct ForwardPass<float>;
template struct ForwardPass<double>;

}  // namespace lstm
}  // namespace v0
}  // namespace haste

================================================ FILE: validation/pytorch.py ================================================
# Copyright 2020 LMNT, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import argparse
from unittest import mock

import torch
import haste_pytorch as haste

# Maps CLI names to haste RNN layer classes.
RNN_MAP = {
    'gru': haste.GRU,
    'indrnn': haste.IndRNN,
    'layer_norm_gru': haste.LayerNormGRU,
    'layer_norm_indrnn': haste.LayerNormIndRNN,
    'layer_norm_lstm': haste.LayerNormLSTM,
    'lstm': haste.LSTM,
}

# haste layers with a comparable torch.nn counterpart.
HASTE_TO_NATIVE = {
    haste.GRU: torch.nn.GRU,
    haste.LSTM: torch.nn.LSTM,
}

batch_size = 32
time_steps = 250
input_size = 128
hidden_size = 256


def self_consistency(rnn, x):
  """Compares the layer's CUDA kernel path against its pure-PyTorch path.

  Runs the same forward/backward twice on GPU tensors, once normally and once
  with `_is_cuda` patched to False to force the fallback implementation, and
  prints the max absolute output and input-gradient differences.
  """
  x_cuda = x.clone().cuda()
  x_cuda_torch = x_cuda.detach().clone()
  x_cuda.requires_grad_(True)
  x_cuda_torch.requires_grad_(True)
  rnn.cuda()
  # Re-seed before each run so stochastic pieces (e.g. zoneout masks) match.
  seed = 5566
  torch.manual_seed(seed)
  y1, _ = rnn.forward(x_cuda)
  y1.backward(torch.ones_like(y1))
  torch.manual_seed(seed)
  with mock.patch.object(rnn, "_is_cuda", lambda: False):
    y2, _ = rnn.forward(x_cuda_torch)
    y2.backward(torch.ones_like(y2))
  # NOTE(review): g1 is the gradient from the patched (fallback) run and g2
  # from the CUDA run, i.e. labels are swapped relative to y1/y2; the printed
  # |g1 - g2| is symmetric so the output is unaffected.
  g1 = x_cuda_torch.grad.data
  g2 = x_cuda.grad.data
  print(torch.max(torch.abs(y1.cpu()-y2.cpu())))
print(torch.max(torch.abs(g1.cpu()-g2.cpu()))) def native_consistency(haste_rnn, pytorch_rnn, x): pytorch_rnn.cuda() haste_rnn.cuda() haste_rnn.from_native_weights( pytorch_rnn.weight_ih_l0, pytorch_rnn.weight_hh_l0, pytorch_rnn.bias_ih_l0, pytorch_rnn.bias_hh_l0) x1 = x.clone().cuda() x2 = x.clone().cuda() x1.requires_grad_(True) x2.requires_grad_(True) y1, _ = haste_rnn.forward(x1) y1.backward(torch.ones_like(y1)) y2, _ = pytorch_rnn.forward(x2) y2.backward(torch.ones_like(y2)) g1 = x1.grad.data g2 = x2.grad.data print(torch.max(torch.abs(y1-y2))) print(torch.max(torch.abs(g1-g2))) def _run_rnn(rnn_type, x, **kwargs): rnn = rnn_type(input_size, hidden_size, **kwargs) self_consistency(rnn, x) if rnn_type in HASTE_TO_NATIVE and not kwargs: pytorch_rnn = HASTE_TO_NATIVE[rnn_type](input_size, hidden_size) native_consistency(rnn, pytorch_rnn, x) def run_rnn(rnn_type, x): for kwargs in [dict(), dict(zoneout=0.5)]: _run_rnn(rnn_type, x, **kwargs) def main(args): x = torch.rand(time_steps, batch_size, input_size) if args.rnn_type == 'all': for type_name, rnn_type in RNN_MAP.items(): print(f'[{type_name}]') run_rnn(rnn_type, x) print('') else: print(f'[{args.rnn_type}]') rnn_type = RNN_MAP[args.rnn_type] rnn = run_rnn(rnn_type, x) if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument( 'rnn_type', nargs='?', default='all', choices=list(RNN_MAP.keys()) + ['all']) main(parser.parse_args()) ================================================ FILE: validation/pytorch_speed.py ================================================ import torch import haste_pytorch as haste from time import time seq_len = 2500 batch_size = 64 input_size = 256 hidden_size = 4096 rnn = haste.IndRNN(input_size, hidden_size).cuda() x = torch.rand(seq_len, batch_size, input_size).cuda() start = time() for _ in range(10): y, _ = rnn(x) y.backward(torch.ones_like(y)) end = time() print(f'{end-start}') ================================================ FILE: validation/tf.py 
================================================ # Copyright 2020 LMNT, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== import argparse import haste_tf as haste import tensorflow as tf def stfu(): import os os.environ['TF_CPP_MIN_LOG_LEVEL'] = '4' tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) def NativeGRUBuilder(hidden_size): return tf.keras.layers.GRU( hidden_size, implementation=2, activation='tanh', recurrent_activation='sigmoid', return_sequences=True, reset_after=True) def NativeLSTMBuilder(hidden_size): return tf.keras.layers.LSTM( hidden_size, implementation=2, activation='tanh', unit_forget_bias=False, recurrent_activation='sigmoid', return_sequences=True) def NativeGRUWeights(native_gru, haste_gru): weights = haste_gru.fw_layer.get_weights() native_gru.variables[0].assign(weights['kernel']) native_gru.variables[1].assign(weights['recurrent_kernel']) native_gru.variables[2].assign(tf.stack([weights['bias'], weights['recurrent_bias']], axis=0)) def NativeLSTMWeights(native_lstm, haste_lstm): def swapple(x): i, g, f, o = tf.split(x, 4, axis=-1) return tf.concat([i, f, g, o], axis=-1) weights = haste_lstm.fw_layer.get_weights() native_lstm.variables[0].assign(swapple(weights['kernel'])) native_lstm.variables[1].assign(swapple(weights['recurrent_kernel'])) native_lstm.variables[2].assign(swapple(weights['bias'])) RNN_MAP = { 'gru': haste.GRU, 'indrnn': 
haste.IndRNN, 'layer_norm_gru': haste.LayerNormGRU, 'layer_norm_lstm': haste.LayerNormLSTM, 'lstm': haste.LSTM, } HASTE_TO_NATIVE = { haste.GRU: NativeGRUBuilder, haste.LSTM: NativeLSTMBuilder, } HASTE_TO_NATIVE_WEIGHTS = { haste.GRU: NativeGRUWeights, haste.LSTM: NativeLSTMWeights, } batch_size = 32 time_steps = 250 input_size = 128 hidden_size = 256 def native_consistency(haste_rnn, native_rnn, x): with tf.GradientTape() as tape: tape.watch(x) y1, _ = haste_rnn(x, training=True) g1 = tape.gradient(y1, x) native_rnn.build(x.shape) HASTE_TO_NATIVE_WEIGHTS[type(haste_rnn)](native_rnn, haste_rnn) with tf.GradientTape() as tape: tape.watch(x) y2 = native_rnn(x, training=True) g2 = tape.gradient(y2, x) print(tf.reduce_max(tf.abs(y2-y1))) print(tf.reduce_max(tf.abs(g2-g1))) def run_rnn(rnn_type, x): rnn = rnn_type(hidden_size) if rnn_type in HASTE_TO_NATIVE: native_rnn = HASTE_TO_NATIVE[rnn_type](hidden_size) native_consistency(rnn, native_rnn, x) def main(args): tf.compat.v1.enable_eager_execution() stfu() x = tf.random.normal([batch_size, time_steps, input_size]) if args.rnn_type == 'all': for type_name, rnn_type in RNN_MAP.items(): print(f'[{type_name}]') run_rnn(rnn_type, x) print('') else: print(f'[{args.rnn_type}]') rnn_type = RNN_MAP[args.rnn_type] rnn = run_rnn(rnn_type, x) if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument( 'rnn_type', nargs='?', default='all', choices=list(RNN_MAP.keys()) + ['all']) main(parser.parse_args()) ================================================ FILE: validation/tf_pytorch.py ================================================ # Copyright 2020 LMNT, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import argparse

import numpy as np
import tensorflow as tf
import haste_tf

import torch
import torch.nn as nn
import haste_pytorch


def stfu():
  """Suppress TensorFlow's startup and logging chatter."""
  import os
  os.environ['TF_CPP_MIN_LOG_LEVEL'] = '4'
  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)


def _copy_parameters(rnn_tf, rnn_pt, names):
  """Copy the named TF weights onto rnn_pt as torch Parameters of the same name."""
  weights = rnn_tf.fw_layer.get_weights()
  for name in names:
    setattr(rnn_pt, name, nn.Parameter(torch.Tensor(weights[name].numpy())))


def copy_weights_gru(rnn_tf, rnn_pt):
  _copy_parameters(rnn_tf, rnn_pt,
                   ('kernel', 'recurrent_kernel', 'bias', 'recurrent_bias'))


def copy_weights_indrnn(rnn_tf, rnn_pt):
  _copy_parameters(rnn_tf, rnn_pt, ('kernel', 'recurrent_scale', 'bias'))


def copy_weights_layer_norm_gru(rnn_tf, rnn_pt):
  _copy_parameters(rnn_tf, rnn_pt,
                   ('kernel', 'recurrent_kernel', 'bias', 'recurrent_bias', 'gamma'))


def copy_weights_layer_norm_indrnn(rnn_tf, rnn_pt):
  _copy_parameters(rnn_tf, rnn_pt,
                   ('kernel', 'recurrent_scale', 'bias', 'gamma'))


def copy_weights_layer_norm_lstm(rnn_tf, rnn_pt):
  _copy_parameters(rnn_tf, rnn_pt,
                   ('kernel', 'recurrent_kernel', 'bias', 'gamma', 'gamma_h', 'beta_h'))


def copy_weights_lstm(rnn_tf, rnn_pt):
  _copy_parameters(rnn_tf, rnn_pt, ('kernel', 'recurrent_kernel', 'bias'))


batch_size = 32
time_steps = 250
input_size = 128
hidden_size = 256

# Maps CLI names to haste TF layer classes.
RNN_MAP = {
    'gru': haste_tf.GRU,
    'indrnn': haste_tf.IndRNN,
    'layer_norm_gru': haste_tf.LayerNormGRU,
    'layer_norm_indrnn': haste_tf.LayerNormIndRNN,
    'layer_norm_lstm': haste_tf.LayerNormLSTM,
    'lstm': haste_tf.LSTM,
}

# TF layer class -> corresponding haste PyTorch layer class.
TF_TO_PT = {
    haste_tf.GRU: haste_pytorch.GRU,
    haste_tf.IndRNN: haste_pytorch.IndRNN,
    haste_tf.LayerNormGRU: haste_pytorch.LayerNormGRU,
    haste_tf.LayerNormIndRNN: haste_pytorch.LayerNormIndRNN,
    haste_tf.LayerNormLSTM: haste_pytorch.LayerNormLSTM,
    haste_tf.LSTM: haste_pytorch.LSTM,
}

# TF layer class -> function that mirrors its weights onto the PyTorch layer.
WEIGHT_COPY_MAP = {
    haste_tf.GRU: copy_weights_gru,
    haste_tf.IndRNN: copy_weights_indrnn,
    haste_tf.LayerNormGRU: copy_weights_layer_norm_gru,
    haste_tf.LayerNormIndRNN: copy_weights_layer_norm_indrnn,
    haste_tf.LayerNormLSTM: copy_weights_layer_norm_lstm,
    haste_tf.LSTM: copy_weights_lstm,
}


def run_rnn(rnn_type, x):
  """Feed the same input through the TF and PyTorch implementations and print
  the max absolute output and input-gradient differences."""
  rnn_tf = rnn_type(hidden_size)
  rnn_pt = TF_TO_PT[rnn_type](input_size, hidden_size, batch_first=True)
  rnn_tf.build(x.shape)
  WEIGHT_COPY_MAP[type(rnn_tf)](rnn_tf, rnn_pt)

  tf_input = tf.convert_to_tensor(x)
  pt_input = torch.Tensor(x)
  pt_input.requires_grad_(True)

  with tf.GradientTape() as tape:
    tape.watch(tf_input)
    tf_output, _ = rnn_tf(tf_input, training=True)
  tf_grad = tape.gradient(tf_output, tf_input)

  pt_output, _ = rnn_pt(pt_input)
  pt_output.backward(torch.ones_like(pt_output))

  print(np.amax(np.abs(tf_output.numpy() - pt_output.detach().numpy())))
  print(np.amax(np.abs(tf_grad.numpy() - pt_input.grad.data.numpy())))


def main(args):
  tf.compat.v1.enable_eager_execution()
  stfu()
  x = np.random.normal(size=[time_steps, batch_size, input_size]).astype(np.float32)
  if args.rnn_type == 'all':
    for type_name, rnn_type in RNN_MAP.items():
      print(f'[{type_name}]')
      run_rnn(rnn_type, x)
      print('')
  else:
    print(f'[{args.rnn_type}]')
    run_rnn(RNN_MAP[args.rnn_type], x)


if __name__ == '__main__':
  parser = argparse.ArgumentParser()
  parser.add_argument(
      'rnn_type',
      nargs='?',
      default='all',
      choices=[*RNN_MAP, 'all'])
  main(parser.parse_args())