Full Code of apache/mxnet for AI

master b84609d3fc73 cached
2643 files
28.3 MB
4.1M tokens
7970 symbols
1 request
Copy disabled (too large) | Download .txt
Showing preview only (16,273K chars total). Download the full file to get everything.
Repository: apache/mxnet
Branch: master
Commit: b84609d3fc73
Files: 2643
Total size: 28.3 MB

Directory structure:
gitextract_zlms863u/

├── .asf.yaml
├── .clang-format
├── .clang-tidy
├── .cmakelintrc
├── .codecov.yml
├── .git-blame-ignore-revs
├── .gitattributes
├── .github/
│   ├── ISSUE_TEMPLATE/
│   │   ├── bug_report.md
│   │   ├── config.yml
│   │   ├── feature_request.md
│   │   ├── flaky_test.md
│   │   └── rfc.md
│   ├── PULL_REQUEST_TEMPLATE.md
│   └── workflows/
│       ├── greetings.yml
│       ├── license_check.yml
│       ├── link_check.yml
│       ├── os_x_mklbuild.yml
│       └── os_x_staticbuild.yml
├── .gitignore
├── .gitmodules
├── .licenserc.yaml
├── .mxnet_root
├── 3rdparty/
│   ├── ctc_include/
│   │   ├── LICENSE
│   │   ├── contrib/
│   │   │   └── moderngpu/
│   │   │       ├── LICENSE
│   │   │       └── include/
│   │   │           ├── device/
│   │   │           │   ├── ctaloadbalance.cuh
│   │   │           │   ├── ctamerge.cuh
│   │   │           │   ├── ctascan.cuh
│   │   │           │   ├── ctasearch.cuh
│   │   │           │   ├── ctasegreduce.cuh
│   │   │           │   ├── ctasegscan.cuh
│   │   │           │   ├── ctasegsort.cuh
│   │   │           │   ├── ctasortedsearch.cuh
│   │   │           │   ├── devicetypes.cuh
│   │   │           │   ├── deviceutil.cuh
│   │   │           │   ├── intrinsics.cuh
│   │   │           │   ├── loadstore.cuh
│   │   │           │   ├── serialsets.cuh
│   │   │           │   └── sortnetwork.cuh
│   │   │           ├── mgpudevice.cuh
│   │   │           ├── mgpuenums.h
│   │   │           └── util/
│   │   │               └── static.h
│   │   └── detail/
│   │       ├── cpu_ctc.h
│   │       ├── ctc_helper.h
│   │       ├── gpu_ctc.h
│   │       ├── gpu_ctc_kernels.h
│   │       └── hostdevice.h
│   ├── miniz/
│   │   ├── miniz.c
│   │   └── miniz.h
│   └── mshadow/
│       ├── .gitignore
│       ├── .travis.yml
│       ├── CHANGES.md
│       ├── CMakeLists.txt
│       ├── LICENSE
│       ├── README.md
│       ├── cmake/
│       │   └── AutoDetectF16C.cmake
│       ├── doc/
│       │   ├── Doxyfile
│       │   ├── README.md
│       │   └── mkdoc.sh
│       ├── guide/
│       │   ├── .gitignore
│       │   ├── Makefile
│       │   ├── README.md
│       │   ├── basic.cpp
│       │   ├── basic_stream.cu
│       │   ├── defop.cpp
│       │   ├── exp-template/
│       │   │   ├── .gitignore
│       │   │   ├── Makefile
│       │   │   └── README.md
│       │   ├── mshadow-ps/
│       │   │   ├── .gitignore
│       │   │   ├── Makefile
│       │   │   ├── README.md
│       │   │   ├── dbstr.h
│       │   │   ├── dist_async_sum-inl.h
│       │   │   ├── dist_async_sum.cpp
│       │   │   ├── local.sh
│       │   │   ├── local_sum-inl.h
│       │   │   ├── local_sum.cpp
│       │   │   └── local_sum.cu
│       │   └── neuralnet/
│       │       ├── Makefile
│       │       ├── README.md
│       │       ├── convnet.cu
│       │       ├── nnet.cu
│       │       ├── nnet_ps.cu
│       │       └── util.h
│       ├── make/
│       │   ├── README.md
│       │   └── mshadow.mk
│       ├── mshadow/
│       │   ├── README.md
│       │   ├── base.h
│       │   ├── bfloat.h
│       │   ├── cuda/
│       │   │   ├── reduce.cuh
│       │   │   └── tensor_gpu-inl.cuh
│       │   ├── dot_engine-inl.h
│       │   ├── expr_engine-inl.h
│       │   ├── expr_scalar-inl.h
│       │   ├── expression.h
│       │   ├── extension/
│       │   │   ├── broadcast.h
│       │   │   ├── broadcast_with_axis.h
│       │   │   ├── channel_pool.h
│       │   │   ├── channel_unpool.h
│       │   │   ├── choose.h
│       │   │   ├── complex.h
│       │   │   ├── concat.h
│       │   │   ├── crop.h
│       │   │   ├── fill.h
│       │   │   ├── flip.h
│       │   │   ├── implicit_gemm.h
│       │   │   ├── mask.h
│       │   │   ├── mirror.h
│       │   │   ├── one_hot.h
│       │   │   ├── pack_col2patch.h
│       │   │   ├── pad.h
│       │   │   ├── range.h
│       │   │   ├── reduce_with_axis.h
│       │   │   ├── reduceto1d.h
│       │   │   ├── reshape.h
│       │   │   ├── slice.h
│       │   │   ├── slice_ex.h
│       │   │   ├── spatial_pool.h
│       │   │   ├── spatial_unpool.h
│       │   │   ├── spatial_upsampling_nearest.h
│       │   │   ├── swapaxis.h
│       │   │   ├── take.h
│       │   │   ├── take_grad.h
│       │   │   ├── transpose.h
│       │   │   └── unpack_patch2col.h
│       │   ├── extension.h
│       │   ├── half.h
│       │   ├── io.h
│       │   ├── packet/
│       │   │   ├── plain-inl.h
│       │   │   └── sse-inl.h
│       │   ├── packet-inl.h
│       │   ├── random.h
│       │   ├── stream_gpu-inl.h
│       │   ├── tensor.h
│       │   ├── tensor_container.h
│       │   ├── tensor_cpu-inl.h
│       │   └── tensor_gpu-inl.h
│       ├── mshadow-ps/
│       │   ├── .gitignore
│       │   ├── README.md
│       │   ├── mshadow_ps.h
│       │   ├── ps_dist-inl.h
│       │   ├── ps_local-inl.h
│       │   ├── ps_rabit-inl.h
│       │   ├── thread.h
│       │   └── thread_util.h
│       ├── scripts/
│       │   └── travis_script.sh
│       └── test/
│           ├── Makefile
│           ├── pairtest.cu
│           ├── pool.cu
│           ├── reshape.cu
│           ├── test.cu
│           ├── test.h
│           └── unpack.cu
├── CMakeLists.txt
├── CODEOWNERS
├── CODE_OF_CONDUCT.md
├── CONTRIBUTORS.md
├── DNNL_README.md
├── LICENSE
├── NEWS.md
├── NOTICE
├── README.md
├── SECURITY.md
├── benchmark/
│   ├── __init__.py
│   ├── opperf/
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── custom_operations/
│   │   │   ├── __init__.py
│   │   │   └── custom_operations.py
│   │   ├── nd_operations/
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   ├── array_manipulation_operators.py
│   │   │   ├── array_rearrange.py
│   │   │   ├── binary_operators.py
│   │   │   ├── gemm_operators.py
│   │   │   ├── indexing_routines.py
│   │   │   ├── linalg_operators.py
│   │   │   ├── misc_operators.py
│   │   │   ├── nn_activation_operators.py
│   │   │   ├── nn_basic_operators.py
│   │   │   ├── nn_conv_operators.py
│   │   │   ├── nn_loss_operators.py
│   │   │   ├── nn_optimizer_operators.py
│   │   │   ├── random_sampling_operators.py
│   │   │   ├── reduction_operators.py
│   │   │   ├── sorting_searching_operators.py
│   │   │   └── unary_operators.py
│   │   ├── opperf.py
│   │   ├── results/
│   │   │   ├── mxnet_operator_benchmark_results_cpu.md
│   │   │   └── mxnet_operator_benchmark_results_gpu.md
│   │   ├── rules/
│   │   │   ├── __init__.py
│   │   │   └── default_params.py
│   │   └── utils/
│   │       ├── __init__.py
│   │       ├── benchmark_operators_pytest.py
│   │       ├── benchmark_utils.py
│   │       ├── common_utils.py
│   │       ├── ndarray_utils.py
│   │       ├── op_registry_utils.py
│   │       └── profiler_utils.py
│   └── python/
│       ├── control_flow/
│       │   └── rnn.py
│       ├── dnnl/
│       │   ├── fc_add.py
│       │   ├── run.sh
│       │   └── run_per_thread.sh
│       ├── einsum/
│       │   └── benchmark_einsum.py
│       ├── ffi/
│       │   └── benchmark_ffi.py
│       ├── metric/
│       │   └── benchmark_metric.py
│       ├── quantization/
│       │   └── benchmark_op.py
│       ├── sparse/
│       │   ├── cast_storage.py
│       │   ├── dot.py
│       │   ├── memory_benchmark.py
│       │   ├── sparse_op.py
│       │   ├── updater.py
│       │   └── util.py
│       └── tvmop/
│           └── benchmark_tvmop.py
├── cd/
│   ├── Jenkinsfile_cd_pipeline
│   ├── Jenkinsfile_release_job
│   ├── Jenkinsfile_utils.groovy
│   ├── README.md
│   ├── mxnet_lib/
│   │   ├── Jenkins_pipeline.groovy
│   │   └── mxnet_lib_pipeline.groovy
│   ├── python/
│   │   ├── docker/
│   │   │   ├── Dockerfile
│   │   │   ├── Dockerfile.test
│   │   │   ├── Jenkins_pipeline.groovy
│   │   │   ├── python_images.sh
│   │   │   └── test_python_image.sh
│   │   └── pypi/
│   │       ├── Jenkins_pipeline.groovy
│   │       ├── README.md
│   │       ├── pypi_package.sh
│   │       └── pypi_publish.py
│   └── utils/
│       ├── artifact_repository.md
│       ├── artifact_repository.py
│       ├── docker_tag.sh
│       ├── mxnet_base_image.sh
│       └── test_artifact_repository.py
├── ci/
│   ├── Jenkinsfile_docker_cache
│   ├── Jenkinsfile_utils.groovy
│   ├── README.md
│   ├── __init__.py
│   ├── build.py
│   ├── build_windows.py
│   ├── dev_menu.py
│   ├── docker/
│   │   ├── Dockerfile.build.android
│   │   ├── Dockerfile.build.arm
│   │   ├── Dockerfile.build.centos7
│   │   ├── Dockerfile.build.jetson
│   │   ├── Dockerfile.build.ubuntu
│   │   ├── Dockerfile.build.ubuntu_cpu_jekyll
│   │   ├── Dockerfile.publish.test.centos7
│   │   ├── Dockerfile.test.arm
│   │   ├── docker-compose.yml
│   │   ├── install/
│   │   │   ├── deb_ubuntu_ccache.sh
│   │   │   ├── docker_filepermissions.sh
│   │   │   ├── requirements
│   │   │   └── ubuntu_adduser.sh
│   │   ├── runtime_functions.sh
│   │   └── toolchains/
│   │       ├── aarch64-linux-gnu-toolchain.cmake
│   │       └── arm-linux-gnueabihf-toolchain.cmake
│   ├── docker_login.py
│   ├── jenkins/
│   │   ├── Jenkins_steps.groovy
│   │   ├── Jenkinsfile_centos_cpu
│   │   ├── Jenkinsfile_centos_gpu
│   │   ├── Jenkinsfile_clang
│   │   ├── Jenkinsfile_edge
│   │   ├── Jenkinsfile_full
│   │   ├── Jenkinsfile_miscellaneous
│   │   ├── Jenkinsfile_sanity
│   │   ├── Jenkinsfile_tools
│   │   ├── Jenkinsfile_unix_cpu
│   │   ├── Jenkinsfile_unix_gpu
│   │   ├── Jenkinsfile_website_beta
│   │   ├── Jenkinsfile_website_full
│   │   ├── Jenkinsfile_website_full_pr
│   │   ├── Jenkinsfile_website_jekyll_docs
│   │   ├── Jenkinsfile_website_mxnet_build
│   │   ├── Jenkinsfile_website_nightly
│   │   ├── Jenkinsfile_website_python_docs
│   │   ├── Jenkinsfile_website_version_artifacts
│   │   ├── Jenkinsfile_windows_cpu
│   │   └── Jenkinsfile_windows_gpu
│   ├── logging.conf
│   ├── other/
│   │   └── ci_deploy_doc.sh
│   ├── publish/
│   │   ├── Jenkinsfile
│   │   ├── README.md
│   │   ├── python/
│   │   │   └── build.sh
│   │   ├── scala/
│   │   │   ├── build.sh
│   │   │   ├── buildkey.py
│   │   │   ├── deploy.sh
│   │   │   ├── fullDeploy.sh
│   │   │   └── test.sh
│   │   └── website/
│   │       ├── README.md
│   │       ├── beta-deploy.sh
│   │       ├── deploy.sh
│   │       └── publish_artifacts.sh
│   ├── test_docker_login.py
│   ├── util.py
│   └── windows/
│       ├── test_py3_cpu.ps1
│       └── test_py3_gpu.ps1
├── cmake/
│   ├── BuildCythonModules.cmake
│   ├── BuildTVM.cmake
│   ├── ChooseBlas.cmake
│   ├── Modules/
│   │   ├── FindAccelerate.cmake
│   │   ├── FindAtlas.cmake
│   │   ├── FindCUDNN.cmake
│   │   ├── FindCUTENSOR.cmake
│   │   ├── FindGperftools.cmake
│   │   ├── FindJeMalloc.cmake
│   │   ├── FindNCCL.cmake
│   │   ├── FindNVML.cmake
│   │   ├── FindNVTX.cmake
│   │   └── FindOpenBLAS.cmake
│   ├── Utils.cmake
│   ├── libmxnet.sym
│   └── upstream/
│       ├── FindBLAS.cmake
│       ├── FindCUDAToolkit.cmake
│       └── select_compute_arch.cmake
├── config/
│   ├── darwin.cmake
│   ├── distribution/
│   │   ├── darwin_cpu.cmake
│   │   ├── darwin_cpu_mkl.cmake
│   │   ├── darwin_native.cmake
│   │   ├── linux_cpu.cmake
│   │   ├── linux_cpu_mkl.cmake
│   │   ├── linux_cu100.cmake
│   │   ├── linux_cu101.cmake
│   │   ├── linux_cu102.cmake
│   │   ├── linux_cu110.cmake
│   │   ├── linux_cu112.cmake
│   │   ├── linux_cu92.cmake
│   │   └── linux_native.cmake
│   ├── linux.cmake
│   └── linux_gpu.cmake
├── conftest.py
├── contrib/
│   └── tvmop/
│       ├── __init__.py
│       ├── basic/
│       │   ├── __init__.py
│       │   └── ufunc.py
│       ├── compile.py
│       ├── core/
│       │   ├── __init__.py
│       │   ├── fromnumeric.py
│       │   ├── multiarray.py
│       │   └── umath.py
│       ├── opdef.py
│       ├── space.py
│       └── utils.py
├── cpp-package/
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── example/
│   │   ├── CMakeLists.txt
│   │   ├── README.md
│   │   ├── alexnet.cpp
│   │   ├── charRNN.cpp
│   │   ├── feature_extract/
│   │   │   ├── README.md
│   │   │   ├── feature_extract.cpp
│   │   │   ├── prepare_data_with_opencv.cpp
│   │   │   └── run.sh
│   │   ├── get_data.sh
│   │   ├── googlenet.cpp
│   │   ├── inception_bn.cpp
│   │   ├── inference/
│   │   │   ├── CMakeLists.txt
│   │   │   ├── README.md
│   │   │   ├── imagenet_inference.cpp
│   │   │   ├── multi_threaded_inference/
│   │   │   │   ├── get_model.py
│   │   │   │   ├── multi_threaded_inference.cc
│   │   │   │   └── unit_test_multi_threaded_inference.sh
│   │   │   ├── sentiment_analysis_rnn.cpp
│   │   │   ├── unit_test_imagenet_inference.sh
│   │   │   └── unit_test_sentiment_analysis_rnn.sh
│   │   ├── lenet.cpp
│   │   ├── lenet_with_mxdataiter.cpp
│   │   ├── mlp.cpp
│   │   ├── mlp_cpu.cpp
│   │   ├── mlp_csv.cpp
│   │   ├── mlp_gpu.cpp
│   │   ├── mnist_to_csv.py
│   │   ├── resnet.cpp
│   │   ├── run_lenet_with_mxdataiter.sh
│   │   ├── test_kvstore.cpp
│   │   ├── test_ndarray_copy.cpp
│   │   ├── test_optimizer.cpp
│   │   ├── test_regress_label.cpp
│   │   ├── test_score.cpp
│   │   ├── unittests/
│   │   │   └── unit_test_mlp_csv.sh
│   │   └── utils.h
│   ├── include/
│   │   └── mxnet-cpp/
│   │       ├── .gitignore
│   │       ├── CPPLINT.cfg
│   │       ├── MxNetCpp.h
│   │       ├── base.h
│   │       ├── contrib.h
│   │       ├── executor.h
│   │       ├── executor.hpp
│   │       ├── initializer.h
│   │       ├── io.h
│   │       ├── io.hpp
│   │       ├── kvstore.h
│   │       ├── kvstore.hpp
│   │       ├── lr_scheduler.h
│   │       ├── metric.h
│   │       ├── model.h
│   │       ├── ndarray.h
│   │       ├── ndarray.hpp
│   │       ├── op_map.h
│   │       ├── op_suppl.h
│   │       ├── op_util.h
│   │       ├── operator.h
│   │       ├── operator.hpp
│   │       ├── optimizer.h
│   │       ├── optimizer.hpp
│   │       ├── shape.h
│   │       ├── symbol.h
│   │       └── symbol.hpp
│   ├── scripts/
│   │   ├── OpWrapperGenerator.py
│   │   └── lint.py
│   └── tests/
│       └── ci_test.sh
├── doap.rdf
├── docker/
│   ├── .gitignore
│   ├── Dockerfiles/
│   │   ├── Dockerfile.in.julia
│   │   ├── Dockerfile.in.lib.cpu
│   │   ├── Dockerfile.in.lib.gpu
│   │   ├── Dockerfile.in.perl
│   │   ├── Dockerfile.in.python
│   │   ├── Dockerfile.in.r-lang
│   │   └── Dockerfile.in.scala
│   ├── README.md
│   ├── docker-python/
│   │   ├── README.md
│   │   ├── build_python_dockerfile.sh
│   │   └── test_mxnet.py
│   ├── install/
│   │   ├── cpp.sh
│   │   ├── julia.sh
│   │   ├── perl.sh
│   │   ├── python.sh
│   │   ├── r.sh
│   │   └── scala.sh
│   ├── run.sh
│   └── tool.sh
├── docs/
│   ├── .dockerignore
│   ├── .gitignore
│   ├── README.md
│   ├── cpp_docs/
│   │   ├── Doxyfile
│   │   └── Makefile
│   ├── python_docs/
│   │   ├── README.md
│   │   ├── _static/
│   │   │   ├── autodoc.js
│   │   │   ├── feedback.css
│   │   │   ├── matomo_analytics.js
│   │   │   └── mxnet.css
│   │   ├── python/
│   │   │   ├── .gitignore
│   │   │   ├── Makefile
│   │   │   ├── Makefile_sphinx
│   │   │   ├── api/
│   │   │   │   ├── autograd/
│   │   │   │   │   └── index.rst
│   │   │   │   ├── contrib/
│   │   │   │   │   ├── index.rst
│   │   │   │   │   ├── io/
│   │   │   │   │   │   └── index.rst
│   │   │   │   │   ├── ndarray/
│   │   │   │   │   │   └── index.rst
│   │   │   │   │   ├── onnx/
│   │   │   │   │   │   └── index.rst
│   │   │   │   │   ├── quantization/
│   │   │   │   │   │   └── index.rst
│   │   │   │   │   ├── symbol/
│   │   │   │   │   │   └── index.rst
│   │   │   │   │   ├── tensorboard/
│   │   │   │   │   │   └── index.rst
│   │   │   │   │   ├── tensorrt/
│   │   │   │   │   │   └── index.rst
│   │   │   │   │   └── text/
│   │   │   │   │       └── index.rst
│   │   │   │   ├── device/
│   │   │   │   │   └── index.rst
│   │   │   │   ├── engine/
│   │   │   │   │   └── index.rst
│   │   │   │   ├── executor/
│   │   │   │   │   └── index.rst
│   │   │   │   ├── gluon/
│   │   │   │   │   ├── block.rst
│   │   │   │   │   ├── constant.rst
│   │   │   │   │   ├── contrib/
│   │   │   │   │   │   └── index.rst
│   │   │   │   │   ├── hybrid_block.rst
│   │   │   │   │   ├── index.rst
│   │   │   │   │   ├── loss/
│   │   │   │   │   │   └── index.rst
│   │   │   │   │   ├── metric/
│   │   │   │   │   │   └── index.rst
│   │   │   │   │   ├── model_zoo/
│   │   │   │   │   │   └── index.rst
│   │   │   │   │   ├── nn/
│   │   │   │   │   │   └── index.rst
│   │   │   │   │   ├── parameter.rst
│   │   │   │   │   ├── rnn/
│   │   │   │   │   │   └── index.rst
│   │   │   │   │   ├── symbol_block.rst
│   │   │   │   │   ├── trainer.rst
│   │   │   │   │   └── utils/
│   │   │   │   │       └── index.rst
│   │   │   │   ├── index.rst
│   │   │   │   ├── initializer/
│   │   │   │   │   └── index.rst
│   │   │   │   ├── kvstore/
│   │   │   │   │   └── index.rst
│   │   │   │   ├── kvstore_server/
│   │   │   │   │   └── index.rst
│   │   │   │   ├── legacy/
│   │   │   │   │   ├── callback/
│   │   │   │   │   │   └── index.rst
│   │   │   │   │   ├── image/
│   │   │   │   │   │   └── index.rst
│   │   │   │   │   ├── index.rst
│   │   │   │   │   ├── io/
│   │   │   │   │   │   └── index.rst
│   │   │   │   │   ├── ndarray/
│   │   │   │   │   │   ├── contrib/
│   │   │   │   │   │   │   └── index.rst
│   │   │   │   │   │   ├── image/
│   │   │   │   │   │   │   └── index.rst
│   │   │   │   │   │   ├── index.rst
│   │   │   │   │   │   ├── linalg/
│   │   │   │   │   │   │   └── index.rst
│   │   │   │   │   │   ├── ndarray.rst
│   │   │   │   │   │   ├── op/
│   │   │   │   │   │   │   └── index.rst
│   │   │   │   │   │   ├── random/
│   │   │   │   │   │   │   └── index.rst
│   │   │   │   │   │   ├── register/
│   │   │   │   │   │   │   └── index.rst
│   │   │   │   │   │   ├── sparse/
│   │   │   │   │   │   │   └── index.rst
│   │   │   │   │   │   └── utils/
│   │   │   │   │   │       └── index.rst
│   │   │   │   │   ├── recordio/
│   │   │   │   │   │   └── index.rst
│   │   │   │   │   ├── symbol/
│   │   │   │   │   │   ├── contrib/
│   │   │   │   │   │   │   └── index.rst
│   │   │   │   │   │   ├── image/
│   │   │   │   │   │   │   └── index.rst
│   │   │   │   │   │   ├── index.rst
│   │   │   │   │   │   ├── linalg/
│   │   │   │   │   │   │   └── index.rst
│   │   │   │   │   │   ├── op/
│   │   │   │   │   │   │   └── index.rst
│   │   │   │   │   │   ├── random/
│   │   │   │   │   │   │   └── index.rst
│   │   │   │   │   │   ├── register/
│   │   │   │   │   │   │   └── index.rst
│   │   │   │   │   │   ├── sparse/
│   │   │   │   │   │   │   └── index.rst
│   │   │   │   │   │   └── symbol.rst
│   │   │   │   │   └── visualization/
│   │   │   │   │       └── index.rst
│   │   │   │   ├── lr_scheduler/
│   │   │   │   │   └── index.rst
│   │   │   │   ├── np/
│   │   │   │   │   ├── arrays.indexing.rst
│   │   │   │   │   ├── arrays.ndarray.rst
│   │   │   │   │   ├── arrays.rst
│   │   │   │   │   ├── index.rst
│   │   │   │   │   ├── random/
│   │   │   │   │   │   └── index.rst
│   │   │   │   │   ├── routines.array-creation.rst
│   │   │   │   │   ├── routines.array-manipulation.rst
│   │   │   │   │   ├── routines.io.rst
│   │   │   │   │   ├── routines.linalg.rst
│   │   │   │   │   ├── routines.math.rst
│   │   │   │   │   ├── routines.rst
│   │   │   │   │   ├── routines.sort.rst
│   │   │   │   │   └── routines.statistics.rst
│   │   │   │   ├── npx/
│   │   │   │   │   └── index.rst
│   │   │   │   ├── optimizer/
│   │   │   │   │   └── index.rst
│   │   │   │   ├── profiler/
│   │   │   │   │   └── index.rst
│   │   │   │   ├── rtc/
│   │   │   │   │   └── index.rst
│   │   │   │   ├── runtime/
│   │   │   │   │   └── index.rst
│   │   │   │   ├── test_utils/
│   │   │   │   │   └── index.rst
│   │   │   │   └── util/
│   │   │   │       └── index.rst
│   │   │   ├── index.rst
│   │   │   ├── scripts/
│   │   │   │   ├── conf.py
│   │   │   │   ├── md2ipynb.py
│   │   │   │   └── process_rst.py
│   │   │   └── tutorials/
│   │   │       ├── deploy/
│   │   │       │   ├── export/
│   │   │       │   │   ├── index.rst
│   │   │       │   │   └── onnx.md
│   │   │       │   ├── index.rst
│   │   │       │   ├── inference/
│   │   │       │   │   ├── cpp.rst
│   │   │       │   │   ├── image_classification_jetson.md
│   │   │       │   │   └── index.rst
│   │   │       │   └── run-on-aws/
│   │   │       │       ├── cloud.md
│   │   │       │       ├── index.rst
│   │   │       │       ├── use_ec2.rst
│   │   │       │       └── use_sagemaker.rst
│   │   │       ├── extend/
│   │   │       │   ├── customop.md
│   │   │       │   └── index.rst
│   │   │       ├── getting-started/
│   │   │       │   ├── crash-course/
│   │   │       │   │   ├── 0-introduction.md
│   │   │       │   │   ├── 1-nparray.md
│   │   │       │   │   ├── 2-create-nn.md
│   │   │       │   │   ├── 3-autograd.md
│   │   │       │   │   ├── 4-components.md
│   │   │       │   │   ├── 5-datasets.md
│   │   │       │   │   ├── 6-train-nn.md
│   │   │       │   │   ├── 7-use-gpus.md
│   │   │       │   │   ├── index.rst
│   │   │       │   │   └── prepare_dataset.py
│   │   │       │   ├── gluon_from_experiment_to_deployment.md
│   │   │       │   ├── gluon_migration_guide.md
│   │   │       │   ├── index.rst
│   │   │       │   ├── logistic_regression_explained.md
│   │   │       │   └── to-mxnet/
│   │   │       │       ├── index.rst
│   │   │       │       └── pytorch.md
│   │   │       ├── index.rst
│   │   │       ├── packages/
│   │   │       │   ├── autograd/
│   │   │       │   │   └── index.md
│   │   │       │   ├── gluon/
│   │   │       │   │   ├── blocks/
│   │   │       │   │   │   ├── activations/
│   │   │       │   │   │   │   └── activations.md
│   │   │       │   │   │   ├── custom-layer.md
│   │   │       │   │   │   ├── hybridize.md
│   │   │       │   │   │   ├── index.rst
│   │   │       │   │   │   ├── init.md
│   │   │       │   │   │   ├── naming.md
│   │   │       │   │   │   ├── nn.md
│   │   │       │   │   │   ├── parameters.md
│   │   │       │   │   │   └── save_load_params.md
│   │   │       │   │   ├── image/
│   │   │       │   │   │   ├── index.rst
│   │   │       │   │   │   ├── info_gan.md
│   │   │       │   │   │   └── mnist.md
│   │   │       │   │   ├── index.rst
│   │   │       │   │   ├── loss/
│   │   │       │   │   │   ├── custom-loss.md
│   │   │       │   │   │   ├── index.rst
│   │   │       │   │   │   ├── kl_divergence.md
│   │   │       │   │   │   └── loss.md
│   │   │       │   │   ├── text/
│   │   │       │   │   │   ├── gnmt.rst
│   │   │       │   │   │   ├── index.rst
│   │   │       │   │   │   └── transformer.rst
│   │   │       │   │   └── training/
│   │   │       │   │       ├── fit_api_tutorial.md
│   │   │       │   │       ├── index.rst
│   │   │       │   │       ├── learning_rates/
│   │   │       │   │       │   ├── index.rst
│   │   │       │   │       │   ├── learning_rate_finder.md
│   │   │       │   │       │   ├── learning_rate_schedules.md
│   │   │       │   │       │   └── learning_rate_schedules_advanced.md
│   │   │       │   │       ├── normalization/
│   │   │       │   │       │   └── index.md
│   │   │       │   │       └── trainer.md
│   │   │       │   ├── index.rst
│   │   │       │   ├── kvstore/
│   │   │       │   │   ├── index.rst
│   │   │       │   │   └── kvstore.md
│   │   │       │   ├── legacy/
│   │   │       │   │   ├── index.rst
│   │   │       │   │   └── ndarray/
│   │   │       │   │       ├── 01-ndarray-intro.md
│   │   │       │   │       ├── 02-ndarray-operations.md
│   │   │       │   │       ├── 03-ndarray-contexts.md
│   │   │       │   │       ├── gotchas_numpy_in_mxnet.md
│   │   │       │   │       ├── index.rst
│   │   │       │   │       └── sparse/
│   │   │       │   │           ├── csr.md
│   │   │       │   │           ├── index.rst
│   │   │       │   │           └── row_sparse.md
│   │   │       │   ├── np/
│   │   │       │   │   ├── cheat-sheet.md
│   │   │       │   │   ├── index.rst
│   │   │       │   │   └── np-vs-numpy.md
│   │   │       │   ├── onnx/
│   │   │       │   │   ├── fine_tuning_gluon.md
│   │   │       │   │   ├── index.rst
│   │   │       │   │   └── inference_on_onnx_model.md
│   │   │       │   ├── optimizer/
│   │   │       │   │   └── index.md
│   │   │       │   └── viz/
│   │   │       │       └── index.rst
│   │   │       └── performance/
│   │   │           ├── backend/
│   │   │           │   ├── amp.md
│   │   │           │   ├── dnnl/
│   │   │           │   │   ├── dnnl_quantization.md
│   │   │           │   │   ├── dnnl_quantization_inc.md
│   │   │           │   │   ├── dnnl_readme.md
│   │   │           │   │   └── index.rst
│   │   │           │   ├── index.rst
│   │   │           │   ├── profiler.md
│   │   │           │   └── tvm.rst
│   │   │           ├── compression/
│   │   │           │   ├── index.rst
│   │   │           │   └── int8.rst
│   │   │           └── index.rst
│   │   ├── requirements
│   │   └── themes/
│   │       ├── .babelrc
│   │       ├── .circleci/
│   │       │   └── config.yml
│   │       ├── .gitignore
│   │       ├── .sassrc
│   │       └── mx-theme/
│   │           ├── LICENSE
│   │           ├── MANIFEST.in
│   │           ├── README.md
│   │           ├── mxtheme/
│   │           │   ├── __init__.py
│   │           │   ├── card.py
│   │           │   ├── drawer.html
│   │           │   ├── feedback.html
│   │           │   ├── footer.html
│   │           │   ├── header.html
│   │           │   ├── header_search.html
│   │           │   ├── header_sourcelink.html
│   │           │   ├── header_top.html
│   │           │   ├── layout.html
│   │           │   ├── localtoc.html
│   │           │   ├── relations.html
│   │           │   ├── search.html
│   │           │   ├── static/
│   │           │   │   ├── fontawesome/
│   │           │   │   │   └── all.css
│   │           │   │   ├── fonts.css
│   │           │   │   ├── sphinx_materialdesign_theme.css
│   │           │   │   └── sphinx_materialdesign_theme.js
│   │           │   └── theme.conf
│   │           ├── setup.py
│   │           └── src/
│   │               ├── js/
│   │               │   ├── adjust-height.js
│   │               │   ├── feedback.js
│   │               │   ├── scrollspy.js
│   │               │   └── sphinx_materialdesign_theme.js
│   │               └── scss/
│   │                   ├── _root.scss
│   │                   ├── _variables.scss
│   │                   ├── admonitions/
│   │                   │   └── _admonitions.scss
│   │                   ├── blockquote/
│   │                   │   └── _blockquote.scss
│   │                   ├── card/
│   │                   │   └── _card.scss
│   │                   ├── code/
│   │                   │   └── _code.scss
│   │                   ├── downloadlink/
│   │                   │   └── _downloadlink.scss
│   │                   ├── drawer/
│   │                   │   └── _drawer.scss
│   │                   ├── fonts/
│   │                   │   └── _material-icons.scss
│   │                   ├── footer/
│   │                   │   └── _footer.scss
│   │                   ├── grid/
│   │                   │   └── _simplegrid.scss
│   │                   ├── header/
│   │                   │   └── _header.scss
│   │                   ├── headerings/
│   │                   │   └── _headerings.scss
│   │                   ├── layout/
│   │                   │   └── _layout.scss
│   │                   ├── lists/
│   │                   │   └── _lists.scss
│   │                   ├── search/
│   │                   │   └── _search.scss
│   │                   ├── sphinx_materialdesign_theme.scss
│   │                   ├── tables/
│   │                   │   └── _tables.scss
│   │                   └── toc/
│   │                       ├── _globaltoc.scss
│   │                       ├── _localtoc.scss
│   │                       └── _toctree.scss
│   ├── static_site/
│   │   ├── .gitignore
│   │   ├── .nojekyll
│   │   ├── Makefile
│   │   ├── README.md
│   │   └── src/
│   │       ├── .asf.yaml
│   │       ├── .gitignore
│   │       ├── .htaccess
│   │       ├── .nojekyll
│   │       ├── 404.html
│   │       ├── Gemfile
│   │       ├── _config.yml
│   │       ├── _config_beta.yml
│   │       ├── _config_prod.yml
│   │       ├── _includes/
│   │       │   ├── callout.html
│   │       │   ├── disqus_comments.html
│   │       │   ├── feedback.html
│   │       │   ├── footer.html
│   │       │   ├── get_started/
│   │       │   │   ├── cloud/
│   │       │   │   │   ├── cpu.md
│   │       │   │   │   └── gpu.md
│   │       │   │   ├── devices/
│   │       │   │   │   ├── nvidia-jetson.md
│   │       │   │   │   └── raspberry_pi.md
│   │       │   │   ├── get_started.html
│   │       │   │   ├── gpu_snippet.md
│   │       │   │   ├── linux/
│   │       │   │   │   ├── clojure/
│   │       │   │   │   │   └── build-from-source.md
│   │       │   │   │   ├── cpp/
│   │       │   │   │   │   └── build-from-source.md
│   │       │   │   │   ├── java/
│   │       │   │   │   │   └── build-from-source.md
│   │       │   │   │   ├── julia/
│   │       │   │   │   │   └── build-from-source.md
│   │       │   │   │   ├── perl/
│   │       │   │   │   │   └── build-from-source.md
│   │       │   │   │   ├── python/
│   │       │   │   │   │   ├── cpu/
│   │       │   │   │   │   │   ├── build-from-source.md
│   │       │   │   │   │   │   ├── docker.md
│   │       │   │   │   │   │   └── pip.md
│   │       │   │   │   │   └── gpu/
│   │       │   │   │   │       ├── build-from-source.md
│   │       │   │   │   │       ├── docker.md
│   │       │   │   │   │       └── pip.md
│   │       │   │   │   ├── r/
│   │       │   │   │   │   └── build-from-source.md
│   │       │   │   │   └── scala/
│   │       │   │   │       └── build-from-source.md
│   │       │   │   └── pip_snippet.md
│   │       │   ├── head.html
│   │       │   ├── header.html
│   │       │   ├── icon-github.html
│   │       │   ├── icon-twitter.html
│   │       │   ├── important.html
│   │       │   ├── matomo-analytics.html
│   │       │   ├── note.html
│   │       │   ├── social.html
│   │       │   ├── tip.html
│   │       │   └── warning.html
│   │       ├── _layouts/
│   │       │   ├── default.html
│   │       │   ├── home.html
│   │       │   ├── page.html
│   │       │   ├── page_api.html
│   │       │   ├── page_category.html
│   │       │   ├── page_landing_tutorials.html
│   │       │   └── post.html
│   │       ├── _plugins/
│   │       │   └── markdowner.rb
│   │       ├── _sass/
│   │       │   ├── feedback.scss
│   │       │   ├── generalVersionDropdown.scss
│   │       │   ├── globalSearch.scss
│   │       │   ├── minima/
│   │       │   │   ├── _base.scss
│   │       │   │   ├── _blog.scss
│   │       │   │   ├── _docs.scss
│   │       │   │   ├── _ecosystem.scss
│   │       │   │   ├── _features.scss
│   │       │   │   ├── _getting_started.scss
│   │       │   │   ├── _home.scss
│   │       │   │   ├── _layout.scss
│   │       │   │   ├── _syntax-highlighting.scss
│   │       │   │   ├── colorful.scss
│   │       │   │   └── simple-grid.scss
│   │       │   └── minima.scss
│   │       ├── assets/
│   │       │   ├── js/
│   │       │   │   ├── clipboard.js
│   │       │   │   ├── copycode.js
│   │       │   │   ├── feedback.js
│   │       │   │   ├── globalSearch.js
│   │       │   │   └── options.js
│   │       │   └── main.scss
│   │       ├── index.html
│   │       └── pages/
│   │           ├── api/
│   │           │   ├── api.html
│   │           │   ├── architecture/
│   │           │   │   ├── exception_handling.md
│   │           │   │   ├── note_data_loading.md
│   │           │   │   ├── note_engine.md
│   │           │   │   ├── note_memory.md
│   │           │   │   ├── overview.md
│   │           │   │   └── program_model.md
│   │           │   ├── clojure/
│   │           │   │   ├── docs/
│   │           │   │   │   └── tutorials/
│   │           │   │   │       ├── index.md
│   │           │   │   │       ├── kvstore.md
│   │           │   │   │       ├── module.md
│   │           │   │   │       ├── ndarray.md
│   │           │   │   │       ├── symbol.md
│   │           │   │   │       └── symbol_in_pictures.md
│   │           │   │   └── index.md
│   │           │   ├── cpp/
│   │           │   │   ├── docs/
│   │           │   │   │   └── tutorials/
│   │           │   │   │       ├── basics.md
│   │           │   │   │       ├── index.md
│   │           │   │   │       ├── multi_threaded_inference.md
│   │           │   │   │       ├── mxnet_cpp_inference_tutorial.md
│   │           │   │   │       └── subgraphAPI.md
│   │           │   │   └── index.md
│   │           │   ├── developer_guide/
│   │           │   │   ├── 1_github_contribution_and_PR_verification_tips.md
│   │           │   │   ├── debugging_and_performance_optimization_tips.md
│   │           │   │   ├── examine_forward_results_with_hooks.md
│   │           │   │   ├── exception_handing_and_custom_error_types.md
│   │           │   │   └── profiling.md
│   │           │   ├── faq/
│   │           │   │   ├── add_op_in_backend.md
│   │           │   │   ├── cloud.md
│   │           │   │   ├── distributed_training.md
│   │           │   │   ├── env_var.md
│   │           │   │   ├── float16.md
│   │           │   │   ├── gradient_compression.md
│   │           │   │   ├── large_tensor_support.md
│   │           │   │   ├── model_parallel_lstm.md
│   │           │   │   ├── new_op.md
│   │           │   │   ├── perf.md
│   │           │   │   ├── recordio.md
│   │           │   │   ├── s3_integration.md
│   │           │   │   ├── security.md
│   │           │   │   ├── tensor_inspector_tutorial.md
│   │           │   │   ├── using_rtc.md
│   │           │   │   └── why_mxnet.md
│   │           │   ├── java/
│   │           │   │   ├── docs/
│   │           │   │   │   └── tutorials/
│   │           │   │   │       ├── index.md
│   │           │   │   │       └── ssd_inference.md
│   │           │   │   └── index.md
│   │           │   ├── julia/
│   │           │   │   └── index.md
│   │           │   ├── perl/
│   │           │   │   ├── docs/
│   │           │   │   │   └── tutorials/
│   │           │   │   │       ├── index.md
│   │           │   │   │       ├── io.md
│   │           │   │   │       ├── kvstore.md
│   │           │   │   │       ├── ndarray.md
│   │           │   │   │       └── symbol.md
│   │           │   │   └── index.md
│   │           │   ├── python/
│   │           │   │   └── index.md
│   │           │   ├── r/
│   │           │   │   ├── docs/
│   │           │   │   │   └── tutorials/
│   │           │   │   │       ├── char_rnn_model.md
│   │           │   │   │       ├── classify_real_image_with_pretrained_model.md
│   │           │   │   │       ├── custom_iterator.md
│   │           │   │   │       ├── index.md
│   │           │   │   │       ├── multi_dim_lstm.md
│   │           │   │   │       ├── ndarray.md
│   │           │   │   │       └── symbol.md
│   │           │   │   └── index.md
│   │           │   └── scala/
│   │           │       ├── docs/
│   │           │       │   └── tutorials/
│   │           │       │       ├── index.md
│   │           │       │       ├── infer.md
│   │           │       │       ├── io.md
│   │           │       │       ├── kvstore.md
│   │           │       │       ├── ndarray.md
│   │           │       │       ├── symbol.md
│   │           │       │       └── symbol_in_pictures.md
│   │           │       └── index.md
│   │           ├── community/
│   │           │   ├── clang_format_guide.md
│   │           │   ├── code_guide.md
│   │           │   ├── code_review.md
│   │           │   ├── committer_guide.md
│   │           │   ├── community.md
│   │           │   ├── document.md
│   │           │   ├── error_handling.md
│   │           │   ├── git_howto.md
│   │           │   ├── index.md
│   │           │   └── pull_request.md
│   │           ├── ecosystem.html
│   │           ├── features.html
│   │           ├── get_started/
│   │           │   ├── build_from_source.md
│   │           │   ├── download.md
│   │           │   ├── index.html
│   │           │   ├── jetson_setup.md
│   │           │   └── validate_mxnet.md
│   │           └── trusted_by.html
│   └── tutorial_utils/
│       └── vision/
│           └── cnn_visualization/
│               └── gradcam.py
├── example/
│   ├── MXNetTutorialTemplate.ipynb
│   ├── README.md
│   ├── adversary/
│   │   ├── README.md
│   │   └── adversary_generation.ipynb
│   ├── bi-lstm-sort/
│   │   ├── README.md
│   │   └── bi-lstm-sort.ipynb
│   ├── distributed_training/
│   │   ├── README.md
│   │   ├── cifar10_dist.py
│   │   └── cifar10_kvstore_hvd.py
│   ├── distributed_training-horovod/
│   │   ├── README.md
│   │   ├── gluon_mnist.py
│   │   └── resnet50_imagenet.py
│   ├── extensions/
│   │   ├── lib_api/
│   │   │   ├── Makefile
│   │   │   ├── init_lib.cc
│   │   │   ├── libtest.cc
│   │   │   └── test_loading.py
│   │   ├── lib_custom_op/
│   │   │   ├── Makefile
│   │   │   ├── README.md
│   │   │   ├── gemm_lib.cc
│   │   │   ├── relu_lib.cc
│   │   │   ├── relu_lib.cu
│   │   │   ├── relu_lib.h
│   │   │   ├── test_gemm.py
│   │   │   ├── test_relu.py
│   │   │   ├── test_transposecsr.py
│   │   │   ├── test_transposerowsp.py
│   │   │   ├── transposecsr_lib.cc
│   │   │   └── transposerowsp_lib.cc
│   │   ├── lib_external_ops/
│   │   │   ├── CMakeLists.txt
│   │   │   ├── README.md
│   │   │   ├── init_lib.cc
│   │   │   ├── min_ex-inl.h
│   │   │   ├── min_ex.cc
│   │   │   ├── min_ex.cu
│   │   │   └── test_loading.py
│   │   ├── lib_pass/
│   │   │   ├── Makefile
│   │   │   ├── README.md
│   │   │   ├── pass_lib.cc
│   │   │   └── test_pass.py
│   │   └── lib_subgraph/
│   │       ├── Makefile
│   │       ├── README.md
│   │       ├── subgraph_lib.cc
│   │       └── test_subgraph.py
│   ├── gluon/
│   │   ├── actor_critic/
│   │   │   ├── README.md
│   │   │   └── actor_critic.py
│   │   ├── data.py
│   │   ├── house_prices/
│   │   │   ├── README.md
│   │   │   └── kaggle_k_fold_cross_validation.py
│   │   ├── image_classification.py
│   │   ├── mnist/
│   │   │   ├── README.md
│   │   │   └── mnist.py
│   │   └── super_resolution/
│   │       ├── README.md
│   │       └── super_resolution.py
│   ├── multi-task/
│   │   ├── README.md
│   │   └── multi-task-learning.ipynb
│   ├── probability/
│   │   └── VAE/
│   │       └── VAE.md
│   ├── profiler/
│   │   ├── README.md
│   │   ├── profiler_imageiter.py
│   │   ├── profiler_matmul.py
│   │   └── profiler_ndarray.py
│   ├── quantization/
│   │   ├── README.md
│   │   ├── imagenet_gen_qsym_onednn.py
│   │   ├── imagenet_inference.py
│   │   └── launch_inference_onednn.sh
│   ├── quantization_inc/
│   │   ├── custom_strategy.py
│   │   ├── resnet50v2_mse.yaml
│   │   ├── resnet_measurement.py
│   │   ├── resnet_mse.py
│   │   └── resnet_tuning.py
│   └── recommenders/
│       ├── .gitignore
│       ├── README.md
│       ├── demo1-MF.ipynb
│       ├── demo2-dssm.ipynb
│       ├── matrix_fact.py
│       └── movielens_data.py
├── include/
│   └── mxnet/
│       ├── api_registry.h
│       ├── base.h
│       ├── c_api.h
│       ├── c_api_error.h
│       ├── c_api_test.h
│       ├── engine.h
│       ├── executor.h
│       ├── expr_operator.h
│       ├── graph_attr_types.h
│       ├── imperative.h
│       ├── io.h
│       ├── ir/
│       │   └── expr.h
│       ├── kvstore.h
│       ├── lib_api.h
│       ├── libinfo.h
│       ├── ndarray.h
│       ├── node/
│       │   ├── container.h
│       │   └── node.h
│       ├── op_attr_types.h
│       ├── operator.h
│       ├── operator_util.h
│       ├── random_generator.h
│       ├── resource.h
│       ├── rtc.h
│       ├── runtime/
│       │   ├── c_runtime_api.h
│       │   ├── container.h
│       │   ├── container_ext.h
│       │   ├── data_type.h
│       │   ├── ffi_helper.h
│       │   ├── memory.h
│       │   ├── ndarray.h
│       │   ├── ndarray_handle.h
│       │   ├── object.h
│       │   ├── packed_func.h
│       │   ├── py_arg.h
│       │   └── registry.h
│       ├── storage.h
│       ├── tensor_blob.h
│       └── tuple.h
├── licenses/
│   ├── BOOST1_0
│   ├── BSD2
│   ├── BSD3-cmake
│   ├── MIT
│   └── OFL1_1
├── plugin/
│   ├── opencv/
│   │   ├── __init__.py
│   │   ├── cv_api.cc
│   │   ├── cv_api.h
│   │   ├── opencv.mk
│   │   └── opencv.py
│   ├── sframe/
│   │   ├── iter_sframe.cc
│   │   └── plugin.mk
│   ├── torch/
│   │   ├── torch.mk
│   │   ├── torch_base.cc
│   │   ├── torch_base.h
│   │   ├── torch_criterion-inl.h
│   │   ├── torch_criterion.cc
│   │   ├── torch_criterion.cu
│   │   ├── torch_function.cc
│   │   ├── torch_function.h
│   │   ├── torch_module-inl.h
│   │   ├── torch_module.cc
│   │   └── torch_module.cu
│   └── warpctc/
│       ├── warpctc-inl.h
│       ├── warpctc.cc
│       ├── warpctc.cu
│       └── warpctc.mk
├── prospector.yaml
├── pytest.ini
├── python/
│   ├── .gitignore
│   ├── README.md
│   ├── mxnet/
│   │   ├── __init__.py
│   │   ├── _api_internal.py
│   │   ├── _ctypes/
│   │   │   ├── __init__.py
│   │   │   ├── _api_internal.py
│   │   │   ├── cached_op.py
│   │   │   ├── ndarray.py
│   │   │   ├── space.py
│   │   │   └── symbol.py
│   │   ├── _cy3/
│   │   │   ├── README.md
│   │   │   └── __init__.py
│   │   ├── _deferred_compute.py
│   │   ├── _ffi/
│   │   │   ├── __init__.py
│   │   │   ├── _ctypes/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── function.py
│   │   │   │   ├── object.py
│   │   │   │   └── types.py
│   │   │   ├── _cy3/
│   │   │   │   └── __init__.py
│   │   │   ├── _cython/
│   │   │   │   ├── base.pxi
│   │   │   │   ├── core.pyx
│   │   │   │   ├── function.pxi
│   │   │   │   ├── ndarray.pxi
│   │   │   │   └── object.pxi
│   │   │   ├── base.py
│   │   │   ├── function.py
│   │   │   ├── node_generic.py
│   │   │   ├── object.py
│   │   │   └── runtime_ctypes.py
│   │   ├── _global_var.py
│   │   ├── _numpy_op_doc.py
│   │   ├── amp/
│   │   │   ├── __init__.py
│   │   │   ├── amp.py
│   │   │   ├── lists/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── symbol_bf16.py
│   │   │   │   └── symbol_fp16.py
│   │   │   └── loss_scaler.py
│   │   ├── api.py
│   │   ├── attribute.py
│   │   ├── autograd.py
│   │   ├── base.py
│   │   ├── callback.py
│   │   ├── container.py
│   │   ├── context.py
│   │   ├── contrib/
│   │   │   ├── __init__.py
│   │   │   ├── io.py
│   │   │   ├── ndarray.py
│   │   │   ├── onnx/
│   │   │   │   └── __init__.py
│   │   │   ├── quantization.py
│   │   │   ├── symbol.py
│   │   │   ├── tensorboard.py
│   │   │   ├── tensorrt.py
│   │   │   └── text/
│   │   │       ├── __init__.py
│   │   │       ├── _constants.py
│   │   │       ├── embedding.py
│   │   │       ├── utils.py
│   │   │       └── vocab.py
│   │   ├── cuda/
│   │   │   ├── __init__.py
│   │   │   └── nvtx.py
│   │   ├── cython/
│   │   │   ├── __init__.py
│   │   │   ├── base.pyi
│   │   │   ├── ndarray.pyx
│   │   │   └── symbol.pyx
│   │   ├── device.py
│   │   ├── dlpack.py
│   │   ├── engine.py
│   │   ├── error.py
│   │   ├── executor.py
│   │   ├── gluon/
│   │   │   ├── .gitignore
│   │   │   ├── __init__.py
│   │   │   ├── block.py
│   │   │   ├── contrib/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── data/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── _constants.py
│   │   │   │   │   └── vision/
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── dataloader.py
│   │   │   │   │       └── transforms/
│   │   │   │   │           ├── __init__.py
│   │   │   │   │           └── bbox/
│   │   │   │   │               ├── __init__.py
│   │   │   │   │               ├── bbox.py
│   │   │   │   │               └── utils.py
│   │   │   │   └── estimator/
│   │   │   │       ├── __init__.py
│   │   │   │       ├── batch_processor.py
│   │   │   │       ├── estimator.py
│   │   │   │       ├── event_handler.py
│   │   │   │       └── utils.py
│   │   │   ├── data/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── _internal.py
│   │   │   │   ├── batchify.py
│   │   │   │   ├── dataloader.py
│   │   │   │   ├── dataset.py
│   │   │   │   ├── sampler.py
│   │   │   │   └── vision/
│   │   │   │       ├── __init__.py
│   │   │   │       ├── datasets.py
│   │   │   │       └── transforms/
│   │   │   │           ├── __init__.py
│   │   │   │           └── image.py
│   │   │   ├── loss.py
│   │   │   ├── metric.py
│   │   │   ├── model_zoo/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── model_store.py
│   │   │   │   └── vision/
│   │   │   │       ├── __init__.py
│   │   │   │       ├── alexnet.py
│   │   │   │       ├── densenet.py
│   │   │   │       ├── inception.py
│   │   │   │       ├── mobilenet.py
│   │   │   │       ├── resnet.py
│   │   │   │       ├── squeezenet.py
│   │   │   │       └── vgg.py
│   │   │   ├── nn/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── activations.py
│   │   │   │   ├── basic_layers.py
│   │   │   │   └── conv_layers.py
│   │   │   ├── parameter.py
│   │   │   ├── probability/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── block/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── stochastic_block.py
│   │   │   │   ├── distributions/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── bernoulli.py
│   │   │   │   │   ├── beta.py
│   │   │   │   │   ├── binomial.py
│   │   │   │   │   ├── categorical.py
│   │   │   │   │   ├── cauchy.py
│   │   │   │   │   ├── chi2.py
│   │   │   │   │   ├── constraint.py
│   │   │   │   │   ├── dirichlet.py
│   │   │   │   │   ├── distribution.py
│   │   │   │   │   ├── divergence.py
│   │   │   │   │   ├── exp_family.py
│   │   │   │   │   ├── exponential.py
│   │   │   │   │   ├── fishersnedecor.py
│   │   │   │   │   ├── gamma.py
│   │   │   │   │   ├── geometric.py
│   │   │   │   │   ├── gumbel.py
│   │   │   │   │   ├── half_cauchy.py
│   │   │   │   │   ├── half_normal.py
│   │   │   │   │   ├── independent.py
│   │   │   │   │   ├── laplace.py
│   │   │   │   │   ├── multinomial.py
│   │   │   │   │   ├── multivariate_normal.py
│   │   │   │   │   ├── negative_binomial.py
│   │   │   │   │   ├── normal.py
│   │   │   │   │   ├── one_hot_categorical.py
│   │   │   │   │   ├── pareto.py
│   │   │   │   │   ├── poisson.py
│   │   │   │   │   ├── relaxed_bernoulli.py
│   │   │   │   │   ├── relaxed_one_hot_categorical.py
│   │   │   │   │   ├── studentT.py
│   │   │   │   │   ├── transformed_distribution.py
│   │   │   │   │   ├── uniform.py
│   │   │   │   │   ├── utils.py
│   │   │   │   │   └── weibull.py
│   │   │   │   └── transformation/
│   │   │   │       ├── __init__.py
│   │   │   │       ├── domain_map.py
│   │   │   │       └── transformation.py
│   │   │   ├── rnn/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── conv_rnn_cell.py
│   │   │   │   ├── rnn_cell.py
│   │   │   │   └── rnn_layer.py
│   │   │   ├── trainer.py
│   │   │   └── utils.py
│   │   ├── image/
│   │   │   ├── __init__.py
│   │   │   ├── detection.py
│   │   │   └── image.py
│   │   ├── initializer.py
│   │   ├── io/
│   │   │   ├── __init__.py
│   │   │   ├── io.py
│   │   │   └── utils.py
│   │   ├── kvstore/
│   │   │   ├── __init__.py
│   │   │   ├── base.py
│   │   │   ├── byteps.py
│   │   │   ├── horovod.py
│   │   │   ├── kvstore.py
│   │   │   └── kvstore_server.py
│   │   ├── libinfo.py
│   │   ├── library.py
│   │   ├── log.py
│   │   ├── lr_scheduler.py
│   │   ├── misc.py
│   │   ├── model.py
│   │   ├── name.py
│   │   ├── ndarray/
│   │   │   ├── __init__.py
│   │   │   ├── _internal.py
│   │   │   ├── contrib.py
│   │   │   ├── image.py
│   │   │   ├── linalg.py
│   │   │   ├── ndarray.py
│   │   │   ├── numpy/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── _api_internal.py
│   │   │   │   ├── _internal.py
│   │   │   │   ├── _op.py
│   │   │   │   ├── _register.py
│   │   │   │   ├── linalg.py
│   │   │   │   └── random.py
│   │   │   ├── numpy_extension/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── _api_internal.py
│   │   │   │   ├── _op.py
│   │   │   │   ├── _register.py
│   │   │   │   ├── control_flow.py
│   │   │   │   ├── image.py
│   │   │   │   └── random.py
│   │   │   ├── op.py
│   │   │   ├── random.py
│   │   │   ├── register.py
│   │   │   ├── sparse.py
│   │   │   └── utils.py
│   │   ├── ndarray_doc.py
│   │   ├── notebook/
│   │   │   ├── __init__.py
│   │   │   └── callback.py
│   │   ├── numpy/
│   │   │   ├── __init__.py
│   │   │   ├── _op.py
│   │   │   ├── _register.py
│   │   │   ├── arrayprint.py
│   │   │   ├── fallback.py
│   │   │   ├── fallback_linalg.py
│   │   │   ├── function_base.py
│   │   │   ├── io.py
│   │   │   ├── linalg.py
│   │   │   ├── multiarray.py
│   │   │   ├── random.py
│   │   │   ├── set_functions.py
│   │   │   ├── stride_tricks.py
│   │   │   ├── type_functions.py
│   │   │   └── utils.py
│   │   ├── numpy_dispatch_protocol.py
│   │   ├── numpy_extension/
│   │   │   ├── __init__.py
│   │   │   ├── _op.py
│   │   │   ├── _register.py
│   │   │   ├── control_flow.py
│   │   │   ├── image.py
│   │   │   ├── random.py
│   │   │   └── utils.py
│   │   ├── numpy_op_fallback.py
│   │   ├── numpy_op_signature.py
│   │   ├── onnx/
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   ├── mx2onnx/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── _export_helper.py
│   │   │   │   ├── _export_model.py
│   │   │   │   ├── _export_onnx.py
│   │   │   │   └── _op_translations/
│   │   │   │       ├── __init__.py
│   │   │   │       ├── _op_translations_opset12.py
│   │   │   │       └── _op_translations_opset13.py
│   │   │   └── setup.py
│   │   ├── operator.py
│   │   ├── optimizer/
│   │   │   ├── __init__.py
│   │   │   ├── adabelief.py
│   │   │   ├── adadelta.py
│   │   │   ├── adagrad.py
│   │   │   ├── adam.py
│   │   │   ├── adamW.py
│   │   │   ├── adamax.py
│   │   │   ├── contrib.py
│   │   │   ├── dcasgd.py
│   │   │   ├── ftml.py
│   │   │   ├── ftrl.py
│   │   │   ├── lamb.py
│   │   │   ├── lans.py
│   │   │   ├── lars.py
│   │   │   ├── nadam.py
│   │   │   ├── nag.py
│   │   │   ├── optimizer.py
│   │   │   ├── rmsprop.py
│   │   │   ├── sgd.py
│   │   │   ├── sgld.py
│   │   │   ├── signum.py
│   │   │   ├── updater.py
│   │   │   └── utils.py
│   │   ├── profiler.py
│   │   ├── random.py
│   │   ├── recordio.py
│   │   ├── registry.py
│   │   ├── rtc.py
│   │   ├── runtime.py
│   │   ├── symbol/
│   │   │   ├── __init__.py
│   │   │   ├── _internal.py
│   │   │   ├── contrib.py
│   │   │   ├── image.py
│   │   │   ├── linalg.py
│   │   │   ├── numpy/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── _internal.py
│   │   │   │   ├── _op.py
│   │   │   │   ├── _register.py
│   │   │   │   ├── _symbol.py
│   │   │   │   ├── linalg.py
│   │   │   │   └── random.py
│   │   │   ├── numpy_extension/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── _op.py
│   │   │   │   ├── _register.py
│   │   │   │   ├── image.py
│   │   │   │   └── random.py
│   │   │   ├── op.py
│   │   │   ├── random.py
│   │   │   ├── register.py
│   │   │   ├── sparse.py
│   │   │   └── symbol.py
│   │   ├── symbol_doc.py
│   │   ├── test_utils.py
│   │   ├── tvmop.py
│   │   ├── util.py
│   │   └── visualization.py
│   └── setup.py
├── rat-excludes
├── readthedocs.yml
├── snap.python
├── src/
│   ├── api/
│   │   ├── _api_internal/
│   │   │   └── _api_internal.cc
│   │   ├── cached_op_api.cc
│   │   └── operator/
│   │       ├── numpy/
│   │       │   ├── linalg/
│   │       │   │   ├── np_det.cc
│   │       │   │   ├── np_eig.cc
│   │       │   │   ├── np_eigvals.cc
│   │       │   │   ├── np_gesvd.cc
│   │       │   │   ├── np_inv.cc
│   │       │   │   ├── np_lstsq.cc
│   │       │   │   ├── np_matrix_rank.cc
│   │       │   │   ├── np_norm.cc
│   │       │   │   ├── np_pinv.cc
│   │       │   │   ├── np_potrf.cc
│   │       │   │   ├── np_qr.cc
│   │       │   │   ├── np_slogdet.cc
│   │       │   │   ├── np_solve.cc
│   │       │   │   ├── np_tensorinv.cc
│   │       │   │   └── np_tensorsolve.cc
│   │       │   ├── np_bincount_op.cc
│   │       │   ├── np_broadcast_reduce_op_boolean.cc
│   │       │   ├── np_broadcast_reduce_op_index.cc
│   │       │   ├── np_broadcast_reduce_op_value.cc
│   │       │   ├── np_cross.cc
│   │       │   ├── np_cumsum.cc
│   │       │   ├── np_delete_op.cc
│   │       │   ├── np_diff_op.cc
│   │       │   ├── np_dot_op.cc
│   │       │   ├── np_ediff1d_op.cc
│   │       │   ├── np_einsum_op.cc
│   │       │   ├── np_elemwise_broadcast_logic_op.cc
│   │       │   ├── np_elemwise_broadcast_op.cc
│   │       │   ├── np_elemwise_broadcast_op_extended_sec.cc
│   │       │   ├── np_elemwise_unary_op_basic.cc
│   │       │   ├── np_fill_diagonal_op.cc
│   │       │   ├── np_histogram_op.cc
│   │       │   ├── np_init_op.cc
│   │       │   ├── np_insert_op.cc
│   │       │   ├── np_interp_op.cc
│   │       │   ├── np_kron.cc
│   │       │   ├── np_matmul_op.cc
│   │       │   ├── np_matrix_op.cc
│   │       │   ├── np_memory_op.cc
│   │       │   ├── np_moments_op.cc
│   │       │   ├── np_nan_to_num_op.cc
│   │       │   ├── np_nonzero_op.cc
│   │       │   ├── np_ordering_op.cc
│   │       │   ├── np_pad_op.cc
│   │       │   ├── np_percentile_op.cc
│   │       │   ├── np_polynomial_op.cc
│   │       │   ├── np_repeat_op.cc
│   │       │   ├── np_tensordot_op.cc
│   │       │   ├── np_trace_op.cc
│   │       │   ├── np_tri_op.cc
│   │       │   ├── np_tril_op.cc
│   │       │   ├── np_triu_op.cc
│   │       │   ├── np_unique_op.cc
│   │       │   ├── np_where_op.cc
│   │       │   ├── np_window_op.cc
│   │       │   └── random/
│   │       │       ├── np_choice_op.cc
│   │       │       ├── np_exponential_op.cc
│   │       │       ├── np_laplace_op.cc
│   │       │       ├── np_location_scale_op.cc
│   │       │       ├── np_multinomial_op.cc
│   │       │       ├── np_pareto_op.cc
│   │       │       ├── np_power_op.cc
│   │       │       ├── np_rayleigh_op.cc
│   │       │       └── np_weibull_op.cc
│   │       ├── numpy_extension/
│   │       │   ├── npx_activation_op.cc
│   │       │   ├── npx_arange_like_op.cc
│   │       │   ├── npx_batch_dot_op.cc
│   │       │   ├── npx_batch_norm_op.cc
│   │       │   ├── npx_broadcast_like_op.cc
│   │       │   ├── npx_control_flow_op.cc
│   │       │   ├── npx_convolution_op.cc
│   │       │   ├── npx_deconvolution_op.cc
│   │       │   ├── npx_dropout_op.cc
│   │       │   ├── npx_embedding_op.cc
│   │       │   ├── npx_fully_connected_op.cc
│   │       │   ├── npx_group_norm_op.cc
│   │       │   ├── npx_layer_norm_op.cc
│   │       │   ├── npx_leaky_relu_op.cc
│   │       │   ├── npx_one_hot_op.cc
│   │       │   ├── npx_pick_op.cc
│   │       │   ├── npx_pooling_op.cc
│   │       │   ├── npx_rnn_op.cc
│   │       │   ├── npx_softmax_op.cc
│   │       │   └── npx_topk_op.cc
│   │       ├── op_utils.cc
│   │       ├── op_utils.h
│   │       ├── random/
│   │       │   ├── np_gamma_op.cc
│   │       │   ├── np_normal_op.cc
│   │       │   ├── np_randint_op.cc
│   │       │   ├── np_uniform_op.cc
│   │       │   └── shuffle_op.cc
│   │       ├── tensor/
│   │       │   ├── elemwise_binary_broadcast_op_extended.cc
│   │       │   ├── indexing_op.cc
│   │       │   ├── matrix_op.cc
│   │       │   └── unravel.cc
│   │       ├── ufunc_helper.cc
│   │       ├── ufunc_helper.h
│   │       ├── utils.cc
│   │       └── utils.h
│   ├── base.cc
│   ├── c_api/
│   │   ├── .clang-tidy
│   │   ├── c_api.cc
│   │   ├── c_api_common.h
│   │   ├── c_api_function.cc
│   │   ├── c_api_ndarray.cc
│   │   ├── c_api_profile.cc
│   │   ├── c_api_symbolic.cc
│   │   └── c_api_test.cc
│   ├── common/
│   │   ├── alm.cc
│   │   ├── alm.h
│   │   ├── cuda/
│   │   │   ├── cudnn_cxx.cc
│   │   │   ├── cudnn_cxx.h
│   │   │   ├── nvtx.h
│   │   │   ├── rtc/
│   │   │   │   ├── backward_functions-inl.h
│   │   │   │   ├── forward_functions-inl.h
│   │   │   │   ├── half-inl.h
│   │   │   │   ├── reducer-inl.h
│   │   │   │   ├── special_functions-inl.h
│   │   │   │   ├── util-inl.h
│   │   │   │   └── vectorization-inl.h
│   │   │   ├── rtc.cc
│   │   │   ├── rtc.h
│   │   │   ├── utils.cc
│   │   │   └── utils.h
│   │   ├── exec_utils.cc
│   │   ├── exec_utils.h
│   │   ├── lazy_alloc_array.h
│   │   ├── object_pool.h
│   │   ├── random_generator.cu
│   │   ├── rtc.cc
│   │   ├── static_array.h
│   │   ├── tensor_inspector.h
│   │   ├── utils.cc
│   │   ├── utils.cu
│   │   └── utils.h
│   ├── engine/
│   │   ├── engine.cc
│   │   ├── engine_impl.h
│   │   ├── naive_engine.cc
│   │   ├── openmp.cc
│   │   ├── openmp.h
│   │   ├── stream_manager.h
│   │   ├── thread_pool.h
│   │   ├── threaded_engine.cc
│   │   ├── threaded_engine.h
│   │   ├── threaded_engine_perdevice.cc
│   │   └── threaded_engine_pooled.cc
│   ├── imperative/
│   │   ├── attach_op_execs_pass.cc
│   │   ├── attach_op_resource_pass.cc
│   │   ├── cached_op.cc
│   │   ├── cached_op.h
│   │   ├── cached_op_threadsafe.cc
│   │   ├── cached_op_threadsafe.h
│   │   ├── cuda_graphs.h
│   │   ├── eliminate_common_expr_pass.cc
│   │   ├── exec_pass.h
│   │   ├── imperative.cc
│   │   ├── imperative_utils.cc
│   │   ├── imperative_utils.h
│   │   ├── infer_graph_attr_pass.cc
│   │   ├── inplace_addto_detect_pass.cc
│   │   ├── naive_cached_op.cc
│   │   ├── naive_cached_op.h
│   │   ├── pointwise_fusion_pass.cc
│   │   ├── simple_partition_pass.cc
│   │   └── simple_partition_pass.h
│   ├── initialize.cc
│   ├── initialize.h
│   ├── io/
│   │   ├── batchify.cc
│   │   ├── dataloader.cc
│   │   ├── dataset.cc
│   │   ├── image_aug_default.cc
│   │   ├── image_augmenter.h
│   │   ├── image_det_aug_default.cc
│   │   ├── image_io.cc
│   │   ├── image_iter_common.h
│   │   ├── image_recordio.h
│   │   ├── inst_vector.h
│   │   ├── io.cc
│   │   ├── iter_batchloader.h
│   │   ├── iter_csv.cc
│   │   ├── iter_image_det_recordio.cc
│   │   ├── iter_image_recordio.cc
│   │   ├── iter_image_recordio_2.cc
│   │   ├── iter_libsvm.cc
│   │   ├── iter_mnist.cc
│   │   ├── iter_normalize.h
│   │   ├── iter_prefetcher.h
│   │   ├── iter_sampler.cc
│   │   ├── iter_sparse.h
│   │   ├── iter_sparse_batchloader.h
│   │   ├── iter_sparse_prefetcher.h
│   │   └── opencv_compatibility.h
│   ├── ir/
│   │   └── expr.cc
│   ├── kvstore/
│   │   ├── comm.h
│   │   ├── comm_tree.h
│   │   ├── gpu_topology.h
│   │   ├── gradient_compression-inl.h
│   │   ├── gradient_compression.cc
│   │   ├── gradient_compression.cu
│   │   ├── gradient_compression.h
│   │   ├── kvstore.cc
│   │   ├── kvstore_dist.h
│   │   ├── kvstore_dist_server.h
│   │   ├── kvstore_local.h
│   │   ├── kvstore_nccl.h
│   │   ├── kvstore_utils.cc
│   │   ├── kvstore_utils.cu
│   │   ├── kvstore_utils.h
│   │   └── p3store_dist.h
│   ├── lang/
│   │   ├── expr.cc
│   │   └── ir.cc
│   ├── lib_api.cc
│   ├── libinfo.cc
│   ├── ndarray/
│   │   ├── ndarray.cc
│   │   ├── ndarray_function-inl.cuh
│   │   ├── ndarray_function-inl.h
│   │   ├── ndarray_function.cc
│   │   ├── ndarray_function.cu
│   │   └── ndarray_function.h
│   ├── nnvm/
│   │   ├── error.h
│   │   ├── gradient.cc
│   │   ├── graph_algorithm.h
│   │   ├── graph_editor.cc
│   │   ├── legacy_json_util.cc
│   │   ├── legacy_op_util.cc
│   │   ├── low_precision_pass.cc
│   │   ├── node_op_util.h
│   │   ├── plan_memory.cc
│   │   └── tvm_bridge.cc
│   ├── operator/
│   │   ├── all_finite-inl.h
│   │   ├── all_finite.cc
│   │   ├── all_finite.cu
│   │   ├── amp_graph_pass.cc
│   │   ├── bilinear_sampler-inl.h
│   │   ├── bilinear_sampler.cc
│   │   ├── bilinear_sampler.cu
│   │   ├── c_lapack_api.cc
│   │   ├── c_lapack_api.h
│   │   ├── channel_op_common.h
│   │   ├── contrib/
│   │   │   ├── adabelief-inl.h
│   │   │   ├── adabelief.cc
│   │   │   ├── adabelief.cu
│   │   │   ├── adamw-inl.h
│   │   │   ├── adamw.cc
│   │   │   ├── adamw.cu
│   │   │   ├── adaptive_avg_pooling-inl.h
│   │   │   ├── adaptive_avg_pooling.cc
│   │   │   ├── adaptive_avg_pooling.cu
│   │   │   ├── allclose_op-inl.h
│   │   │   ├── allclose_op.cc
│   │   │   ├── allclose_op.cu
│   │   │   ├── bilinear_resize-inl.cuh
│   │   │   ├── bilinear_resize-inl.h
│   │   │   ├── bilinear_resize.cc
│   │   │   ├── bilinear_resize.cu
│   │   │   ├── boolean_mask-inl.h
│   │   │   ├── boolean_mask.cc
│   │   │   ├── boolean_mask.cu
│   │   │   ├── bounding_box-common.h
│   │   │   ├── bounding_box-inl.cuh
│   │   │   ├── bounding_box-inl.h
│   │   │   ├── bounding_box.cc
│   │   │   ├── bounding_box.cu
│   │   │   ├── count_sketch-inl.h
│   │   │   ├── count_sketch.cc
│   │   │   ├── count_sketch.cu
│   │   │   ├── deformable_psroi_pooling-inl.h
│   │   │   ├── deformable_psroi_pooling.cc
│   │   │   ├── deformable_psroi_pooling.cu
│   │   │   ├── dgl_graph-inl.h
│   │   │   ├── dgl_graph.cc
│   │   │   ├── dgl_graph.cu
│   │   │   ├── dynamic_shape_ops-inl.h
│   │   │   ├── dynamic_shape_ops.cc
│   │   │   ├── erfinv-inl.h
│   │   │   ├── fft-inl.h
│   │   │   ├── fft.cc
│   │   │   ├── fft.cu
│   │   │   ├── gradient_multiplier_op.cc
│   │   │   ├── gradient_multiplier_op.cu
│   │   │   ├── hawkes_ll-inl.h
│   │   │   ├── hawkes_ll.cc
│   │   │   ├── hawkes_ll.cu
│   │   │   ├── index_array-inl.h
│   │   │   ├── index_array.cc
│   │   │   ├── index_array.cu
│   │   │   ├── index_copy-inl.h
│   │   │   ├── index_copy.cc
│   │   │   ├── index_copy.cu
│   │   │   ├── intgemm/
│   │   │   │   ├── intgemm_fully_connected_op.cc
│   │   │   │   ├── max_absolute_op.cc
│   │   │   │   ├── prepare_data_op.cc
│   │   │   │   ├── prepare_weight_op.cc
│   │   │   │   └── take_weight_op.cc
│   │   │   ├── krprod.cc
│   │   │   ├── krprod.h
│   │   │   ├── mrcnn_mask_target-inl.h
│   │   │   ├── mrcnn_mask_target.cu
│   │   │   ├── multi_lamb-inl.h
│   │   │   ├── multi_lamb.cc
│   │   │   ├── multi_lamb.cu
│   │   │   ├── multi_lans-inl.h
│   │   │   ├── multi_lans.cc
│   │   │   ├── multi_lans.cu
│   │   │   ├── multi_lars-inl.h
│   │   │   ├── multi_lars.cc
│   │   │   ├── multi_lars.cu
│   │   │   ├── multi_proposal-inl.h
│   │   │   ├── multi_proposal.cc
│   │   │   ├── multi_proposal.cu
│   │   │   ├── multi_sum_sq-inl.h
│   │   │   ├── multi_sum_sq.cc
│   │   │   ├── multi_sum_sq.cu
│   │   │   ├── multibox_detection-inl.h
│   │   │   ├── multibox_detection.cc
│   │   │   ├── multibox_detection.cu
│   │   │   ├── multibox_prior-inl.h
│   │   │   ├── multibox_prior.cc
│   │   │   ├── multibox_prior.cu
│   │   │   ├── multibox_target-inl.h
│   │   │   ├── multibox_target.cc
│   │   │   ├── multibox_target.cu
│   │   │   ├── nn/
│   │   │   │   ├── deformable_im2col.cuh
│   │   │   │   ├── deformable_im2col.h
│   │   │   │   ├── modulated_deformable_im2col.cuh
│   │   │   │   └── modulated_deformable_im2col.h
│   │   │   ├── nnz.cc
│   │   │   ├── optimizer_op-inl.h
│   │   │   ├── optimizer_op.cc
│   │   │   ├── optimizer_op.cu
│   │   │   ├── preloaded_multi_sgd-inl.h
│   │   │   ├── preloaded_multi_sgd.cc
│   │   │   ├── preloaded_multi_sgd.cu
│   │   │   ├── proposal-inl.h
│   │   │   ├── proposal.cc
│   │   │   ├── proposal.cu
│   │   │   ├── psroi_pooling-inl.h
│   │   │   ├── psroi_pooling.cc
│   │   │   ├── psroi_pooling.cu
│   │   │   ├── quadratic_op-inl.h
│   │   │   ├── quadratic_op.cc
│   │   │   ├── quadratic_op.cu
│   │   │   ├── reset_arrays-inl.h
│   │   │   ├── reset_arrays.cc
│   │   │   ├── reset_arrays.cu
│   │   │   ├── roi_align-inl.h
│   │   │   ├── roi_align.cc
│   │   │   ├── roi_align.cu
│   │   │   ├── rroi_align-inl.h
│   │   │   ├── rroi_align.cc
│   │   │   ├── stes_op.cc
│   │   │   ├── stes_op.cu
│   │   │   ├── stes_op.h
│   │   │   ├── sync_batch_norm-inl.h
│   │   │   ├── sync_batch_norm.cc
│   │   │   ├── sync_batch_norm.cu
│   │   │   ├── transformer-inl.h
│   │   │   ├── transformer.cc
│   │   │   ├── transformer.cu
│   │   │   └── tvmop/
│   │   │       ├── dot.cc
│   │   │       └── ufunc.cc
│   │   ├── control_flow.cc
│   │   ├── correlation-inl.h
│   │   ├── correlation.cc
│   │   ├── correlation.cu
│   │   ├── crop-inl.h
│   │   ├── crop.cc
│   │   ├── crop.cu
│   │   ├── cross_device_copy.cc
│   │   ├── cudnn_bilinear_sampler-inl.h
│   │   ├── cudnn_lrn-inl.h
│   │   ├── cudnn_ops.cc
│   │   ├── cudnn_ops.h
│   │   ├── cudnn_spatial_transformer-inl.h
│   │   ├── custom/
│   │   │   ├── custom-inl.h
│   │   │   ├── custom.cc
│   │   │   ├── native_op-inl.h
│   │   │   ├── native_op.cc
│   │   │   ├── native_op.cu
│   │   │   ├── ndarray_op-inl.h
│   │   │   └── ndarray_op.cc
│   │   ├── deformable_convolution-inl.h
│   │   ├── deformable_convolution.cc
│   │   ├── deformable_convolution.cu
│   │   ├── elemwise_op_common.h
│   │   ├── fusion/
│   │   │   ├── fused_op-inl.h
│   │   │   ├── fused_op.cc
│   │   │   ├── fused_op.cu
│   │   │   └── fused_op.h
│   │   ├── grid_generator-inl.h
│   │   ├── grid_generator.cc
│   │   ├── grid_generator.cu
│   │   ├── identity_attach_KL_sparse_reg-inl.h
│   │   ├── identity_attach_KL_sparse_reg.cc
│   │   ├── identity_attach_KL_sparse_reg.cu
│   │   ├── image/
│   │   │   ├── crop-inl.h
│   │   │   ├── crop.cc
│   │   │   ├── crop.cu
│   │   │   ├── image_random-inl.h
│   │   │   ├── image_random.cc
│   │   │   ├── image_random.cu
│   │   │   ├── image_utils.h
│   │   │   ├── resize-inl.h
│   │   │   ├── resize.cc
│   │   │   └── resize.cu
│   │   ├── instance_norm-inl.h
│   │   ├── instance_norm.cc
│   │   ├── instance_norm.cu
│   │   ├── l2_normalization-inl.h
│   │   ├── l2_normalization.cc
│   │   ├── l2_normalization.cu
│   │   ├── leaky_relu-inl.h
│   │   ├── leaky_relu.cc
│   │   ├── leaky_relu.cu
│   │   ├── linalg.h
│   │   ├── linalg_impl.h
│   │   ├── loss_binary_op-inl.h
│   │   ├── loss_binary_op.cc
│   │   ├── loss_binary_op.cu
│   │   ├── make_loss-inl.h
│   │   ├── make_loss.cc
│   │   ├── make_loss.cu
│   │   ├── math_functions-inl.h
│   │   ├── mkl_functions-inl.h
│   │   ├── modulated_deformable_convolution-inl.h
│   │   ├── modulated_deformable_convolution.cc
│   │   ├── modulated_deformable_convolution.cu
│   │   ├── mshadow_op.h
│   │   ├── mxnet_op.h
│   │   ├── nn/
│   │   │   ├── activation-inl.h
│   │   │   ├── activation.cc
│   │   │   ├── activation.cu
│   │   │   ├── batch_norm-inl.h
│   │   │   ├── batch_norm.cc
│   │   │   ├── batch_norm.cu
│   │   │   ├── concat-inl.h
│   │   │   ├── concat.cc
│   │   │   ├── concat.cu
│   │   │   ├── convolution-inl.h
│   │   │   ├── convolution.cc
│   │   │   ├── convolution.cu
│   │   │   ├── ctc_loss-inl.h
│   │   │   ├── ctc_loss.cc
│   │   │   ├── ctc_loss.cu
│   │   │   ├── cudnn/
│   │   │   │   ├── cudnn_activation-inl.h
│   │   │   │   ├── cudnn_algoreg-inl.h
│   │   │   │   ├── cudnn_algoreg.cc
│   │   │   │   ├── cudnn_batch_norm.cu
│   │   │   │   ├── cudnn_batch_norm.h
│   │   │   │   ├── cudnn_convolution-inl.h
│   │   │   │   ├── cudnn_deconvolution-inl.h
│   │   │   │   ├── cudnn_pooling-inl.h
│   │   │   │   └── cudnn_softmax_activation-inl.h
│   │   │   ├── deconvolution-inl.h
│   │   │   ├── deconvolution.cc
│   │   │   ├── deconvolution.cu
│   │   │   ├── depthwise_convolution-inl.h
│   │   │   ├── depthwise_convolution_tf.cuh
│   │   │   ├── dnnl/
│   │   │   │   ├── dnnl_act-inl.h
│   │   │   │   ├── dnnl_act.cc
│   │   │   │   ├── dnnl_base-inl.h
│   │   │   │   ├── dnnl_base.cc
│   │   │   │   ├── dnnl_batch_dot-inl.h
│   │   │   │   ├── dnnl_batch_dot.cc
│   │   │   │   ├── dnnl_batch_norm-inl.h
│   │   │   │   ├── dnnl_batch_norm.cc
│   │   │   │   ├── dnnl_binary-inl.h
│   │   │   │   ├── dnnl_binary.cc
│   │   │   │   ├── dnnl_concat-inl.h
│   │   │   │   ├── dnnl_concat.cc
│   │   │   │   ├── dnnl_convolution-inl.h
│   │   │   │   ├── dnnl_convolution.cc
│   │   │   │   ├── dnnl_copy-inl.h
│   │   │   │   ├── dnnl_copy.cc
│   │   │   │   ├── dnnl_deconvolution-inl.h
│   │   │   │   ├── dnnl_deconvolution.cc
│   │   │   │   ├── dnnl_dot-inl.h
│   │   │   │   ├── dnnl_dot.cc
│   │   │   │   ├── dnnl_eltwise-inl.h
│   │   │   │   ├── dnnl_eltwise.cc
│   │   │   │   ├── dnnl_fully_connected-inl.h
│   │   │   │   ├── dnnl_fully_connected.cc
│   │   │   │   ├── dnnl_layer_norm-inl.h
│   │   │   │   ├── dnnl_layer_norm.cc
│   │   │   │   ├── dnnl_log_softmax.cc
│   │   │   │   ├── dnnl_lrn-inl.h
│   │   │   │   ├── dnnl_masked_softmax-inl.h
│   │   │   │   ├── dnnl_masked_softmax.cc
│   │   │   │   ├── dnnl_pooling-inl.h
│   │   │   │   ├── dnnl_pooling.cc
│   │   │   │   ├── dnnl_pow_mul_scalar-inl.h
│   │   │   │   ├── dnnl_pow_mul_scalar.cc
│   │   │   │   ├── dnnl_reduce-inl.h
│   │   │   │   ├── dnnl_reduce.cc
│   │   │   │   ├── dnnl_reshape-inl.h
│   │   │   │   ├── dnnl_reshape.cc
│   │   │   │   ├── dnnl_rnn-inl.h
│   │   │   │   ├── dnnl_rnn.cc
│   │   │   │   ├── dnnl_softmax-inl.h
│   │   │   │   ├── dnnl_softmax.cc
│   │   │   │   ├── dnnl_softmax_output-inl.h
│   │   │   │   ├── dnnl_softmax_output.cc
│   │   │   │   ├── dnnl_split-inl.h
│   │   │   │   ├── dnnl_split.cc
│   │   │   │   ├── dnnl_stack-inl.h
│   │   │   │   ├── dnnl_stack.cc
│   │   │   │   ├── dnnl_sum-inl.h
│   │   │   │   ├── dnnl_sum.cc
│   │   │   │   ├── dnnl_transpose-inl.h
│   │   │   │   ├── dnnl_transpose.cc
│   │   │   │   ├── dnnl_where-inl.h
│   │   │   │   └── dnnl_where.cc
│   │   │   ├── dropout-inl.h
│   │   │   ├── dropout.cc
│   │   │   ├── dropout.cu
│   │   │   ├── fully_connected-inl.h
│   │   │   ├── fully_connected.cc
│   │   │   ├── fully_connected.cu
│   │   │   ├── group_norm-inl.h
│   │   │   ├── group_norm.cc
│   │   │   ├── group_norm.cu
│   │   │   ├── im2col-inl.h
│   │   │   ├── im2col.cc
│   │   │   ├── im2col.cu
│   │   │   ├── im2col.cuh
│   │   │   ├── im2col.h
│   │   │   ├── layer_norm-inl.h
│   │   │   ├── layer_norm.cc
│   │   │   ├── layer_norm.cu
│   │   │   ├── layer_norm_cpu.h
│   │   │   ├── log_softmax.cc
│   │   │   ├── log_softmax.cu
│   │   │   ├── lrn-inl.h
│   │   │   ├── lrn.cc
│   │   │   ├── lrn.cu
│   │   │   ├── masked_softmax.cc
│   │   │   ├── moments-inl.h
│   │   │   ├── moments.cc
│   │   │   ├── moments.cu
│   │   │   ├── pool.cuh
│   │   │   ├── pool.h
│   │   │   ├── pool_utils.h
│   │   │   ├── pooling-inl.h
│   │   │   ├── pooling.cc
│   │   │   ├── pooling.cu
│   │   │   ├── sequence_mask-inl.h
│   │   │   ├── softmax-inl.h
│   │   │   ├── softmax.cc
│   │   │   ├── softmax.cu
│   │   │   ├── softmax_activation-inl.h
│   │   │   ├── softmax_activation.cc
│   │   │   ├── softmax_activation.cu
│   │   │   ├── softmin.cc
│   │   │   ├── softmin.cu
│   │   │   ├── upsampling-inl.h
│   │   │   ├── upsampling.cc
│   │   │   └── upsampling.cu
│   │   ├── npx_control_flow.cc
│   │   ├── npx_control_flow.h
│   │   ├── numpy/
│   │   │   ├── linalg/
│   │   │   │   ├── broadcast_reduce_customized-inl.h
│   │   │   │   ├── broadcast_reduce_op_customized.h
│   │   │   │   ├── np_eig-inl.h
│   │   │   │   ├── np_eig.cc
│   │   │   │   ├── np_eig.cu
│   │   │   │   ├── np_eigvals-inl.h
│   │   │   │   ├── np_eigvals.cc
│   │   │   │   ├── np_eigvals.cu
│   │   │   │   ├── np_gesvd-inl.h
│   │   │   │   ├── np_gesvd.cc
│   │   │   │   ├── np_gesvd.cu
│   │   │   │   ├── np_lstsq-inl.h
│   │   │   │   ├── np_lstsq.cc
│   │   │   │   ├── np_lstsq.cu
│   │   │   │   ├── np_matrix_rank-inl.h
│   │   │   │   ├── np_matrix_rank.cc
│   │   │   │   ├── np_matrix_rank.cu
│   │   │   │   ├── np_norm-inl.h
│   │   │   │   ├── np_norm.cc
│   │   │   │   ├── np_norm_backward.cc
│   │   │   │   ├── np_norm_backward.cu
│   │   │   │   ├── np_norm_forward.cc
│   │   │   │   ├── np_norm_forward.cu
│   │   │   │   ├── np_pinv-inl.h
│   │   │   │   ├── np_pinv.cc
│   │   │   │   ├── np_pinv.cu
│   │   │   │   ├── np_potrf-inl.h
│   │   │   │   ├── np_potrf.cc
│   │   │   │   ├── np_potrf.cu
│   │   │   │   ├── np_qr-inl.h
│   │   │   │   ├── np_qr.cc
│   │   │   │   ├── np_qr.cu
│   │   │   │   ├── np_solve-inl.h
│   │   │   │   ├── np_solve.cc
│   │   │   │   ├── np_solve.cu
│   │   │   │   ├── np_tensorinv-inl.h
│   │   │   │   ├── np_tensorinv.cc
│   │   │   │   ├── np_tensorinv.cu
│   │   │   │   ├── np_tensorsolve-inl.h
│   │   │   │   ├── np_tensorsolve.cc
│   │   │   │   └── np_tensorsolve.cu
│   │   │   ├── np_bincount_op-inl.h
│   │   │   ├── np_bincount_op.cc
│   │   │   ├── np_bincount_op.cu
│   │   │   ├── np_boolean_mask_assign.cc
│   │   │   ├── np_boolean_mask_assign.cu
│   │   │   ├── np_broadcast_reduce_op.cc
│   │   │   ├── np_broadcast_reduce_op.h
│   │   │   ├── np_broadcast_reduce_op_boolean.cc
│   │   │   ├── np_broadcast_reduce_op_boolean.cu
│   │   │   ├── np_broadcast_reduce_op_index.cc
│   │   │   ├── np_broadcast_reduce_op_index.cu
│   │   │   ├── np_broadcast_reduce_op_value.h
│   │   │   ├── np_broadcast_reduce_op_value_broadcast_to.cc
│   │   │   ├── np_broadcast_reduce_op_value_broadcast_to.cu
│   │   │   ├── np_broadcast_reduce_op_value_max.cc
│   │   │   ├── np_broadcast_reduce_op_value_max.cu
│   │   │   ├── np_broadcast_reduce_op_value_mean.cc
│   │   │   ├── np_broadcast_reduce_op_value_mean.cu
│   │   │   ├── np_broadcast_reduce_op_value_min.cc
│   │   │   ├── np_broadcast_reduce_op_value_min.cu
│   │   │   ├── np_broadcast_reduce_op_value_prod.cc
│   │   │   ├── np_broadcast_reduce_op_value_prod.cu
│   │   │   ├── np_broadcast_reduce_op_value_sum.cc
│   │   │   ├── np_broadcast_reduce_op_value_sum.cu
│   │   │   ├── np_constraint_check.cc
│   │   │   ├── np_constraint_check.cu
│   │   │   ├── np_constraint_check.h
│   │   │   ├── np_cross-inl.h
│   │   │   ├── np_cross.cc
│   │   │   ├── np_cross.cu
│   │   │   ├── np_cumsum-inl.h
│   │   │   ├── np_cumsum.cc
│   │   │   ├── np_cumsum.cu
│   │   │   ├── np_delete_op-inl.h
│   │   │   ├── np_delete_op.cc
│   │   │   ├── np_delete_op.cu
│   │   │   ├── np_diff-inl.h
│   │   │   ├── np_diff.cc
│   │   │   ├── np_diff.cu
│   │   │   ├── np_dot-inl.h
│   │   │   ├── np_dot_backward.cc
│   │   │   ├── np_dot_backward.cu
│   │   │   ├── np_dot_forward.cc
│   │   │   ├── np_dot_forward.cu
│   │   │   ├── np_ediff1d_op-inl.h
│   │   │   ├── np_ediff1d_op.cc
│   │   │   ├── np_ediff1d_op.cu
│   │   │   ├── np_einsum_op-inl.h
│   │   │   ├── np_einsum_op.cc
│   │   │   ├── np_einsum_op.cu
│   │   │   ├── np_einsum_path_op-inl.h
│   │   │   ├── np_elemwise_broadcast_logic_op.h
│   │   │   ├── np_elemwise_broadcast_logic_op_and.cc
│   │   │   ├── np_elemwise_broadcast_logic_op_and.cu
│   │   │   ├── np_elemwise_broadcast_logic_op_equal.cc
│   │   │   ├── np_elemwise_broadcast_logic_op_equal.cu
│   │   │   ├── np_elemwise_broadcast_logic_op_greater.cc
│   │   │   ├── np_elemwise_broadcast_logic_op_greater.cu
│   │   │   ├── np_elemwise_broadcast_logic_op_greater_equal.cc
│   │   │   ├── np_elemwise_broadcast_logic_op_greater_equal.cu
│   │   │   ├── np_elemwise_broadcast_logic_op_less.cc
│   │   │   ├── np_elemwise_broadcast_logic_op_less.cu
│   │   │   ├── np_elemwise_broadcast_logic_op_less_equal.cc
│   │   │   ├── np_elemwise_broadcast_logic_op_less_equal.cu
│   │   │   ├── np_elemwise_broadcast_logic_op_not_equal.cc
│   │   │   ├── np_elemwise_broadcast_logic_op_not_equal.cu
│   │   │   ├── np_elemwise_broadcast_logic_op_or.cc
│   │   │   ├── np_elemwise_broadcast_logic_op_or.cu
│   │   │   ├── np_elemwise_broadcast_logic_op_xor.cc
│   │   │   ├── np_elemwise_broadcast_logic_op_xor.cu
│   │   │   ├── np_elemwise_broadcast_op.h
│   │   │   ├── np_elemwise_broadcast_op_add.cc
│   │   │   ├── np_elemwise_broadcast_op_add.cu
│   │   │   ├── np_elemwise_broadcast_op_extended.cc
│   │   │   ├── np_elemwise_broadcast_op_extended.cu
│   │   │   ├── np_elemwise_broadcast_op_extended_sec.cc
│   │   │   ├── np_elemwise_broadcast_op_extended_sec.cu
│   │   │   ├── np_elemwise_broadcast_op_extended_thi.cc
│   │   │   ├── np_elemwise_broadcast_op_extended_thi.cu
│   │   │   ├── np_elemwise_broadcast_op_lae.cc
│   │   │   ├── np_elemwise_broadcast_op_lae.cu
│   │   │   ├── np_elemwise_broadcast_op_mod.cc
│   │   │   ├── np_elemwise_broadcast_op_mod.cu
│   │   │   ├── np_elemwise_broadcast_op_mul.cc
│   │   │   ├── np_elemwise_broadcast_op_mul.cu
│   │   │   ├── np_elemwise_broadcast_op_pow.cc
│   │   │   ├── np_elemwise_broadcast_op_pow.cu
│   │   │   ├── np_elemwise_broadcast_op_scalar.cc
│   │   │   ├── np_elemwise_broadcast_op_scalar.cu
│   │   │   ├── np_elemwise_broadcast_op_sub.cc
│   │   │   ├── np_elemwise_broadcast_op_sub.cu
│   │   │   ├── np_elemwise_unary_op_basic.cc
│   │   │   ├── np_elemwise_unary_op_basic.cu
│   │   │   ├── np_fill_diagonal_op-inl.h
│   │   │   ├── np_fill_diagonal_op.cc
│   │   │   ├── np_fill_diagonal_op.cu
│   │   │   ├── np_floor_divide.cc
│   │   │   ├── np_floor_divide.cu
│   │   │   ├── np_indexing_op.cc
│   │   │   ├── np_indexing_op.cu
│   │   │   ├── np_indexing_op.h
│   │   │   ├── np_init_op.cc
│   │   │   ├── np_init_op.cu
│   │   │   ├── np_init_op.h
│   │   │   ├── np_insert_op-inl.h
│   │   │   ├── np_insert_op_scalar-inl.h
│   │   │   ├── np_insert_op_scalar.cc
│   │   │   ├── np_insert_op_scalar.cu
│   │   │   ├── np_insert_op_slice-inl.h
│   │   │   ├── np_insert_op_slice.cc
│   │   │   ├── np_insert_op_slice.cu
│   │   │   ├── np_insert_op_tensor-inl.h
│   │   │   ├── np_insert_op_tensor.cc
│   │   │   ├── np_insert_op_tensor.cu
│   │   │   ├── np_interp_op-inl.h
│   │   │   ├── np_interp_op.cc
│   │   │   ├── np_interp_op.cu
│   │   │   ├── np_kron-inl.h
│   │   │   ├── np_kron_backward.cc
│   │   │   ├── np_kron_backward.cu
│   │   │   ├── np_kron_forward.cc
│   │   │   ├── np_kron_forward.cu
│   │   │   ├── np_matmul_op-inl.h
│   │   │   ├── np_matmul_op.cc
│   │   │   ├── np_matmul_op.cu
│   │   │   ├── np_matrix_op-inl.h
│   │   │   ├── np_matrix_op.cc
│   │   │   ├── np_matrix_op.cu
│   │   │   ├── np_memory_op.cc
│   │   │   ├── np_memory_op.cu
│   │   │   ├── np_memory_op.h
│   │   │   ├── np_moments_op.cc
│   │   │   ├── np_moments_op.cu
│   │   │   ├── np_nonzero_op-inl.h
│   │   │   ├── np_nonzero_op.cc
│   │   │   ├── np_nonzero_op.cu
│   │   │   ├── np_pad_op-inl.h
│   │   │   ├── np_pad_op.cc
│   │   │   ├── np_pad_op.cu
│   │   │   ├── np_percentile_op-inl.h
│   │   │   ├── np_percentile_op.cc
│   │   │   ├── np_percentile_op.cu
│   │   │   ├── np_polynomial_op-inl.h
│   │   │   ├── np_polynomial_op.cc
│   │   │   ├── np_polynomial_op.cu
│   │   │   ├── np_repeat_op-inl.h
│   │   │   ├── np_repeat_op.cc
│   │   │   ├── np_repeat_op.cu
│   │   │   ├── np_tensordot_op-inl.h
│   │   │   ├── np_tensordot_op.cc
│   │   │   ├── np_tensordot_op.cu
│   │   │   ├── np_trace_op-inl.h
│   │   │   ├── np_trace_op.cc
│   │   │   ├── np_trace_op.cu
│   │   │   ├── np_tri_op-inl.h
│   │   │   ├── np_tri_op.cc
│   │   │   ├── np_tri_op.cu
│   │   │   ├── np_tril_op-inl.h
│   │   │   ├── np_tril_op.cc
│   │   │   ├── np_tril_op.cu
│   │   │   ├── np_triu_op-inl.h
│   │   │   ├── np_triu_op.cc
│   │   │   ├── np_triu_op.cu
│   │   │   ├── np_true_divide-inl.h
│   │   │   ├── np_true_divide.cc
│   │   │   ├── np_true_divide.cu
│   │   │   ├── np_unique_op.cc
│   │   │   ├── np_unique_op.cu
│   │   │   ├── np_unique_op.h
│   │   │   ├── np_where_backward_op.cc
│   │   │   ├── np_where_backward_op.cu
│   │   │   ├── np_where_forward_op.cc
│   │   │   ├── np_where_forward_op.cu
│   │   │   ├── np_where_op-inl.h
│   │   │   ├── np_window_op.cc
│   │   │   ├── np_window_op.cu
│   │   │   ├── np_window_op.h
│   │   │   └── random/
│   │   │       ├── dist_common.cc
│   │   │       ├── dist_common.cu
│   │   │       ├── dist_common.h
│   │   │       ├── np_bernoulli_op.cc
│   │   │       ├── np_bernoulli_op.cu
│   │   │       ├── np_bernoulli_op.h
│   │   │       ├── np_choice_op.cc
│   │   │       ├── np_choice_op.cu
│   │   │       ├── np_choice_op.h
│   │   │       ├── np_exponential_op.cc
│   │   │       ├── np_exponential_op.cu
│   │   │       ├── np_exponential_op.h
│   │   │       ├── np_gamma_op.cc
│   │   │       ├── np_gamma_op.cu
│   │   │       ├── np_gamma_op.h
│   │   │       ├── np_laplace_op.cc
│   │   │       ├── np_laplace_op.cu
│   │   │       ├── np_laplace_op.h
│   │   │       ├── np_location_scale_op.cc
│   │   │       ├── np_location_scale_op.cu
│   │   │       ├── np_location_scale_op.h
│   │   │       ├── np_multinomial_op.cc
│   │   │       ├── np_multinomial_op.cu
│   │   │       ├── np_multinomial_op.h
│   │   │       ├── np_normal_op.cc
│   │   │       ├── np_normal_op.cu
│   │   │       ├── np_normal_op.h
│   │   │       ├── np_pareto_op.cc
│   │   │       ├── np_pareto_op.cu
│   │   │       ├── np_pareto_op.h
│   │   │       ├── np_power_op.cc
│   │   │       ├── np_power_op.cu
│   │   │       ├── np_power_op.h
│   │   │       ├── np_rayleigh_op.cc
│   │   │       ├── np_rayleigh_op.cu
│   │   │       ├── np_rayleigh_op.h
│   │   │       ├── np_uniform_op.cc
│   │   │       ├── np_uniform_op.cu
│   │   │       ├── np_uniform_op.h
│   │   │       ├── np_weibull_op.cc
│   │   │       ├── np_weibull_op.cu
│   │   │       └── np_weibull_op.h
│   │   ├── operator.cc
│   │   ├── operator_common.h
│   │   ├── operator_tune-inl.h
│   │   ├── operator_tune.cc
│   │   ├── operator_tune.h
│   │   ├── operator_util.cc
│   │   ├── optimizer_op-inl.h
│   │   ├── optimizer_op.cc
│   │   ├── optimizer_op.cu
│   │   ├── pad-inl.h
│   │   ├── pad.cc
│   │   ├── pad.cu
│   │   ├── quantization/
│   │   │   ├── calibrate-inl.h
│   │   │   ├── calibrate.cc
│   │   │   ├── dequantize-inl.h
│   │   │   ├── dequantize.cc
│   │   │   ├── dequantize.cu
│   │   │   ├── dnnl/
│   │   │   │   ├── dnnl_dequantize-inl.h
│   │   │   │   ├── dnnl_quantize-inl.h
│   │   │   │   ├── dnnl_quantize_asym-inl.h
│   │   │   │   ├── dnnl_quantize_v2-inl.h
│   │   │   │   ├── dnnl_quantized_act.cc
│   │   │   │   ├── dnnl_quantized_batch_norm.cc
│   │   │   │   ├── dnnl_quantized_concat.cc
│   │   │   │   ├── dnnl_quantized_conv.cc
│   │   │   │   ├── dnnl_quantized_elemwise_add.cc
│   │   │   │   ├── dnnl_quantized_flatten.cc
│   │   │   │   ├── dnnl_quantized_fully_connected.cc
│   │   │   │   ├── dnnl_quantized_ops-inl.h
│   │   │   │   ├── dnnl_quantized_pooling.cc
│   │   │   │   ├── dnnl_quantized_reshape.cc
│   │   │   │   ├── dnnl_quantized_rnn-inl.h
│   │   │   │   ├── dnnl_quantized_rnn.cc
│   │   │   │   ├── dnnl_quantized_transpose.cc
│   │   │   │   └── dnnl_requantize-inl.h
│   │   │   ├── quantization_utils.h
│   │   │   ├── quantize-inl.h
│   │   │   ├── quantize.cc
│   │   │   ├── quantize.cu
│   │   │   ├── quantize_asym-inl.h
│   │   │   ├── quantize_asym.cc
│   │   │   ├── quantize_graph_pass.cc
│   │   │   ├── quantize_v2-inl.h
│   │   │   ├── quantize_v2.cc
│   │   │   ├── quantize_v2.cu
│   │   │   ├── quantized_activation.cc
│   │   │   ├── quantized_batch_norm.cc
│   │   │   ├── quantized_batch_norm_relu.cc
│   │   │   ├── quantized_concat.cc
│   │   │   ├── quantized_conv.cc
│   │   │   ├── quantized_conv.cu
│   │   │   ├── quantized_elemwise_add-inl.h
│   │   │   ├── quantized_elemwise_add.cc
│   │   │   ├── quantized_elemwise_mul-inl.h
│   │   │   ├── quantized_elemwise_mul.cc
│   │   │   ├── quantized_flatten-inl.h
│   │   │   ├── quantized_flatten.cc
│   │   │   ├── quantized_flatten.cu
│   │   │   ├── quantized_fully_connected.cc
│   │   │   ├── quantized_fully_connected.cu
│   │   │   ├── quantized_indexing_op.cc
│   │   │   ├── quantized_pooling.cc
│   │   │   ├── quantized_pooling.cu
│   │   │   ├── quantized_reshape-inl.h
│   │   │   ├── quantized_reshape.cc
│   │   │   ├── quantized_rnn-inl.h
│   │   │   ├── quantized_rnn.cc
│   │   │   ├── quantized_transpose.cc
│   │   │   ├── requantize-inl.h
│   │   │   ├── requantize.cc
│   │   │   └── requantize.cu
│   │   ├── random/
│   │   │   ├── multisample_op.cc
│   │   │   ├── multisample_op.cu
│   │   │   ├── multisample_op.h
│   │   │   ├── pdf_op.cc
│   │   │   ├── pdf_op.cu
│   │   │   ├── pdf_op.h
│   │   │   ├── sample_multinomial_op.cc
│   │   │   ├── sample_multinomial_op.cu
│   │   │   ├── sample_multinomial_op.h
│   │   │   ├── sample_op.cc
│   │   │   ├── sample_op.cu
│   │   │   ├── sample_op.h
│   │   │   ├── sampler.h
│   │   │   ├── shuffle_op.cc
│   │   │   ├── shuffle_op.cu
│   │   │   ├── unique_sample_op.cc
│   │   │   └── unique_sample_op.h
│   │   ├── regression_output-inl.h
│   │   ├── regression_output.cc
│   │   ├── regression_output.cu
│   │   ├── rnn-inl.h
│   │   ├── rnn.cc
│   │   ├── rnn.cu
│   │   ├── rnn_impl.h
│   │   ├── roi_pooling-inl.h
│   │   ├── roi_pooling.cc
│   │   ├── roi_pooling.cu
│   │   ├── sequence_last-inl.h
│   │   ├── sequence_last.cc
│   │   ├── sequence_last.cu
│   │   ├── sequence_mask-inl.h
│   │   ├── sequence_mask.cc
│   │   ├── sequence_mask.cu
│   │   ├── sequence_op_common.h
│   │   ├── sequence_reverse-inl.h
│   │   ├── sequence_reverse.cc
│   │   ├── sequence_reverse.cu
│   │   ├── slice_channel-inl.h
│   │   ├── slice_channel.cc
│   │   ├── slice_channel.cu
│   │   ├── softmax_output-inl.h
│   │   ├── softmax_output.cc
│   │   ├── softmax_output.cu
│   │   ├── spatial_transformer-inl.h
│   │   ├── spatial_transformer.cc
│   │   ├── spatial_transformer.cu
│   │   ├── special_functions-inl.h
│   │   ├── subgraph/
│   │   │   ├── build_subgraph.cc
│   │   │   ├── common.h
│   │   │   ├── default_subgraph_property.cc
│   │   │   ├── default_subgraph_property_v2.cc
│   │   │   ├── dnnl/
│   │   │   │   ├── dnnl_batch_dot.cc
│   │   │   │   ├── dnnl_batch_dot_property.h
│   │   │   │   ├── dnnl_bn_relu.cc
│   │   │   │   ├── dnnl_bn_relu_property.h
│   │   │   │   ├── dnnl_common.h
│   │   │   │   ├── dnnl_conv-inl.h
│   │   │   │   ├── dnnl_conv.cc
│   │   │   │   ├── dnnl_conv_property.h
│   │   │   │   ├── dnnl_fc-inl.h
│   │   │   │   ├── dnnl_fc.cc
│   │   │   │   ├── dnnl_fc_property.h
│   │   │   │   ├── dnnl_fc_sum_fuse_property.h
│   │   │   │   ├── dnnl_identity_property.h
│   │   │   │   ├── dnnl_post_amp_property.h
│   │   │   │   ├── dnnl_post_quantize_align_scale_property.h
│   │   │   │   ├── dnnl_post_quantize_property.h
│   │   │   │   ├── dnnl_pow_mul_scalar.cc
│   │   │   │   ├── dnnl_pow_mul_scalar_property.h
│   │   │   │   ├── dnnl_remove_casts_property.h
│   │   │   │   ├── dnnl_subgraph_base-inl.h
│   │   │   │   ├── dnnl_subgraph_property.cc
│   │   │   │   ├── dnnl_transformer-inl.h
│   │   │   │   ├── dnnl_transformer.cc
│   │   │   │   ├── dnnl_transformer_qk_common.h
│   │   │   │   ├── dnnl_transformer_qk_property.h
│   │   │   │   └── dnnl_transformer_valatt_property.h
│   │   │   ├── eliminate_common_nodes_pass.cc
│   │   │   ├── partitioner/
│   │   │   │   └── custom_subgraph_property.h
│   │   │   ├── static_shape_subgraph_property.cc
│   │   │   ├── subgraph_property.h
│   │   │   └── tensorrt/
│   │   │       ├── nnvm_to_onnx-inl.h
│   │   │       ├── nnvm_to_onnx.cc
│   │   │       ├── onnx_to_tensorrt.cc
│   │   │       ├── onnx_to_tensorrt.h
│   │   │       ├── tensorrt-inl.h
│   │   │       ├── tensorrt.cc
│   │   │       └── tensorrt.cu
│   │   ├── subgraph_op_common.cc
│   │   ├── subgraph_op_common.h
│   │   ├── svm_output-inl.h
│   │   ├── svm_output.cc
│   │   ├── svm_output.cu
│   │   ├── swapaxis-inl.h
│   │   ├── swapaxis.cc
│   │   ├── swapaxis.cu
│   │   ├── tensor/
│   │   │   ├── amp_cast.cc
│   │   │   ├── amp_cast.cu
│   │   │   ├── amp_cast.h
│   │   │   ├── broadcast_reduce-inl.h
│   │   │   ├── broadcast_reduce_minmax_value.cc
│   │   │   ├── broadcast_reduce_minmax_value.cu
│   │   │   ├── broadcast_reduce_norm_value.cc
│   │   │   ├── broadcast_reduce_norm_value.cu
│   │   │   ├── broadcast_reduce_op.cc
│   │   │   ├── broadcast_reduce_op.h
│   │   │   ├── broadcast_reduce_op_index.cc
│   │   │   ├── broadcast_reduce_op_index.cu
│   │   │   ├── broadcast_reduce_op_value.cc
│   │   │   ├── broadcast_reduce_op_value.cu
│   │   │   ├── broadcast_reduce_prod_value.cc
│   │   │   ├── broadcast_reduce_prod_value.cu
│   │   │   ├── broadcast_reduce_sum_value.cc
│   │   │   ├── broadcast_reduce_sum_value.cu
│   │   │   ├── cast_storage-inl.cuh
│   │   │   ├── cast_storage-inl.h
│   │   │   ├── cast_storage.cc
│   │   │   ├── cast_storage.cu
│   │   │   ├── control_flow_op.cc
│   │   │   ├── control_flow_op.cu
│   │   │   ├── control_flow_op.h
│   │   │   ├── diag_op-inl.h
│   │   │   ├── diag_op.cc
│   │   │   ├── diag_op.cu
│   │   │   ├── dot-inl.cuh
│   │   │   ├── dot-inl.h
│   │   │   ├── dot.cc
│   │   │   ├── dot.cu
│   │   │   ├── elemwise_binary_broadcast_op.cc
│   │   │   ├── elemwise_binary_broadcast_op.h
│   │   │   ├── elemwise_binary_broadcast_op_basic.cc
│   │   │   ├── elemwise_binary_broadcast_op_basic.cu
│   │   │   ├── elemwise_binary_broadcast_op_extended.cc
│   │   │   ├── elemwise_binary_broadcast_op_extended.cu
│   │   │   ├── elemwise_binary_broadcast_op_logic.cc
│   │   │   ├── elemwise_binary_broadcast_op_logic.cu
│   │   │   ├── elemwise_binary_op-inl.h
│   │   │   ├── elemwise_binary_op.cc
│   │   │   ├── elemwise_binary_op.h
│   │   │   ├── elemwise_binary_op_basic.cc
│   │   │   ├── elemwise_binary_op_basic.cu
│   │   │   ├── elemwise_binary_op_extended.cc
│   │   │   ├── elemwise_binary_op_extended.cu
│   │   │   ├── elemwise_binary_op_logic.cc
│   │   │   ├── elemwise_binary_op_logic.cu
│   │   │   ├── elemwise_binary_scalar_op.cc
│   │   │   ├── elemwise_binary_scalar_op.h
│   │   │   ├── elemwise_binary_scalar_op_basic.cc
│   │   │   ├── elemwise_binary_scalar_op_basic.cu
│   │   │   ├── elemwise_binary_scalar_op_extended.cc
│   │   │   ├── elemwise_binary_scalar_op_extended.cu
│   │   │   ├── elemwise_binary_scalar_op_logic.cc
│   │   │   ├── elemwise_binary_scalar_op_logic.cu
│   │   │   ├── elemwise_sum.cc
│   │   │   ├── elemwise_sum.cu
│   │   │   ├── elemwise_sum.h
│   │   │   ├── elemwise_unary_op.cc
│   │   │   ├── elemwise_unary_op.h
│   │   │   ├── elemwise_unary_op_basic.cc
│   │   │   ├── elemwise_unary_op_basic.cu
│   │   │   ├── elemwise_unary_op_logexp.cc
│   │   │   ├── elemwise_unary_op_logexp.cu
│   │   │   ├── elemwise_unary_op_pow.cc
│   │   │   ├── elemwise_unary_op_pow.cu
│   │   │   ├── elemwise_unary_op_trig.cc
│   │   │   ├── elemwise_unary_op_trig.cu
│   │   │   ├── histogram-inl.h
│   │   │   ├── histogram.cc
│   │   │   ├── histogram.cu
│   │   │   ├── index_add-inl.h
│   │   │   ├── index_add_backward.cc
│   │   │   ├── index_add_backward.cu
│   │   │   ├── index_add_forward.cc
│   │   │   ├── index_add_forward.cu
│   │   │   ├── index_update-inl.h
│   │   │   ├── index_update.cc
│   │   │   ├── index_update.cu
│   │   │   ├── indexing_op-inl.cuh
│   │   │   ├── indexing_op.cc
│   │   │   ├── indexing_op.cu
│   │   │   ├── indexing_op.h
│   │   │   ├── init_op.cc
│   │   │   ├── init_op.cu
│   │   │   ├── init_op.h
│   │   │   ├── la_op-inl.h
│   │   │   ├── la_op.cc
│   │   │   ├── la_op.cu
│   │   │   ├── la_op.h
│   │   │   ├── matrix_op-inl.h
│   │   │   ├── matrix_op.cc
│   │   │   ├── matrix_op.cu
│   │   │   ├── ordering_op-inl.h
│   │   │   ├── ordering_op.cc
│   │   │   ├── ordering_op.cu
│   │   │   ├── pseudo2DTranspose_op-inl.cuh
│   │   │   ├── ravel.cc
│   │   │   ├── ravel.cu
│   │   │   ├── ravel.h
│   │   │   ├── reduce_rtc.cc
│   │   │   ├── slice-inl.h
│   │   │   ├── sort_op-inl.cuh
│   │   │   ├── sort_op.h
│   │   │   ├── sparse_retain-inl.h
│   │   │   ├── sparse_retain.cc
│   │   │   ├── sparse_retain.cu
│   │   │   ├── square_sum-inl.h
│   │   │   ├── square_sum.cc
│   │   │   ├── square_sum.cu
│   │   │   └── util/
│   │   │       ├── tensor_util-inl.cuh
│   │   │       └── tensor_util-inl.h
│   │   └── tvmop/
│   │       ├── op_module.cc
│   │       └── op_module.h
│   ├── optimizer/
│   │   └── sgd-inl.h
│   ├── profiler/
│   │   ├── aggregate_stats.cc
│   │   ├── aggregate_stats.h
│   │   ├── custom_op_profiler.h
│   │   ├── profiler.cc
│   │   ├── profiler.h
│   │   ├── storage_profiler.cc
│   │   ├── storage_profiler.h
│   │   ├── vtune.cc
│   │   └── vtune.h
│   ├── resource.cc
│   ├── runtime/
│   │   ├── c_runtime_api.cc
│   │   ├── container.cc
│   │   ├── ndarray_handle.cc
│   │   ├── object.cc
│   │   ├── object_internal.h
│   │   └── registry.cc
│   ├── serialization/
│   │   ├── cnpy.cc
│   │   └── cnpy.h
│   └── storage/
│       ├── cpu_device_storage.h
│       ├── cpu_shared_storage_manager.h
│       ├── gpu_device_storage.h
│       ├── naive_storage_manager.h
│       ├── pinned_memory_storage.h
│       ├── pooled_storage_manager.h
│       ├── storage.cc
│       ├── storage_manager.h
│       └── storage_manager_helpers.h
├── tests/
│   ├── .gitignore
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── cpp/
│   │   ├── .gitignore
│   │   ├── engine/
│   │   │   ├── engine_shutdown_test.cc
│   │   │   ├── omp_test.cc
│   │   │   ├── thread_local_test.cc
│   │   │   └── threaded_engine_test.cc
│   │   ├── include/
│   │   │   ├── test_core_op.h
│   │   │   ├── test_dnnl.h
│   │   │   ├── test_legacy_op.h
│   │   │   ├── test_ndarray_utils.h
│   │   │   ├── test_op.h
│   │   │   ├── test_op_runner.h
│   │   │   ├── test_perf.h
│   │   │   ├── test_tune.h
│   │   │   └── test_util.h
│   │   ├── kvstore/
│   │   │   └── gpu_topology_test.cc
│   │   ├── misc/
│   │   │   ├── base.cc
│   │   │   └── libinfo_test.cc
│   │   ├── operator/
│   │   │   ├── activation_perf.cc
│   │   │   ├── batchnorm_test.cc
│   │   │   ├── coreop_perf.cc
│   │   │   ├── dnnl_operator_test.cc
│   │   │   ├── dnnl_test.cc
│   │   │   ├── dropout_perf.cc
│   │   │   ├── fully_conn_perf.cc
│   │   │   ├── krprod_test.cc
│   │   │   ├── runner/
│   │   │   │   └── core_op_runner_test.cc
│   │   │   ├── slice_channel_perf.cc
│   │   │   └── tune/
│   │   │       └── operator_tune_test.cc
│   │   ├── storage/
│   │   │   └── storage_test.cc
│   │   └── test_main.cc
│   ├── nightly/
│   │   ├── .gitignore
│   │   ├── Jenkinsfile
│   │   ├── JenkinsfileForBinaries
│   │   ├── README.md
│   │   ├── TestDoc/
│   │   │   ├── doc_spell_checker.py
│   │   │   └── doc_spell_grammar.sh
│   │   ├── common.py
│   │   ├── dist_async_kvstore.py
│   │   ├── dist_device_sync_kvstore.py
│   │   ├── dist_device_sync_kvstore_byteps.py
│   │   ├── dist_device_sync_kvstore_custom.py
│   │   ├── dist_device_sync_kvstore_horovod.py
│   │   ├── dist_sync_kvstore.py
│   │   ├── estimator/
│   │   │   ├── test_estimator_cnn.py
│   │   │   └── test_sentiment_rnn.py
│   │   ├── model_backwards_compatibility_check/
│   │   │   ├── JenkinsfileForMBCC
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   ├── common.py
│   │   │   ├── model_backward_compat_checker.sh
│   │   │   ├── model_backwards_compat_inference.py
│   │   │   ├── model_backwards_compat_train.py
│   │   │   ├── train_mxnet_legacy_models.sh
│   │   │   └── upload_models_to_s3.sh
│   │   ├── test_distributed_training-gpu.sh
│   │   ├── test_kvstore.py
│   │   ├── test_large_array.py
│   │   ├── test_large_vector.py
│   │   ├── test_np_large_array.py
│   │   ├── test_np_random.py
│   │   └── test_server_profiling.py
│   ├── python/
│   │   ├── README.md
│   │   ├── amp/
│   │   │   └── common.py
│   │   ├── array-api/
│   │   │   └── test_data_interchange.py
│   │   ├── common/
│   │   │   └── models.py
│   │   ├── conftest.py
│   │   ├── dnnl/
│   │   │   ├── op_cfg.py
│   │   │   ├── subgraphs/
│   │   │   │   ├── subgraph_common.py
│   │   │   │   ├── test_amp_subgraph.py
│   │   │   │   ├── test_conv_subgraph.py
│   │   │   │   ├── test_fc_subgraph.py
│   │   │   │   ├── test_matmul_subgraph.py
│   │   │   │   └── test_pow_mul_subgraph.py
│   │   │   ├── test_amp.py
│   │   │   ├── test_bf16_operator.py
│   │   │   ├── test_dnnl.py
│   │   │   └── test_quantization_dnnl.py
│   │   ├── doctest/
│   │   │   └── test_docstring.py
│   │   ├── gpu/
│   │   │   ├── test_amp.py
│   │   │   ├── test_amp_init.py
│   │   │   ├── test_deferred_compute_gpu.py
│   │   │   ├── test_device.py
│   │   │   ├── test_extensions_gpu.py
│   │   │   ├── test_fusion.py
│   │   │   ├── test_gluon_gpu.py
│   │   │   ├── test_gluon_model_zoo_gpu.py
│   │   │   ├── test_gluon_transforms.py
│   │   │   ├── test_kvstore_gpu.py
│   │   │   ├── test_nccl.py
│   │   │   ├── test_numpy_einsum.py
│   │   │   ├── test_numpy_fallback.py
│   │   │   ├── test_operator_gpu.py
│   │   │   ├── test_profiler_gpu.py
│   │   │   ├── test_rtc.py
│   │   │   ├── test_tvm_bridge.py
│   │   │   └── test_tvm_op_gpu.py
│   │   ├── onnx/
│   │   │   ├── test_models.py
│   │   │   └── test_operators.py
│   │   ├── profiling/
│   │   │   ├── simple_forward.py
│   │   │   └── test_nvtx.py
│   │   ├── quantization/
│   │   │   └── test_quantization.py
│   │   ├── test_quantization_gpu.py
│   │   ├── train/
│   │   │   ├── common.py
│   │   │   └── test_autograd.py
│   │   └── unittest/
│   │       ├── common.py
│   │       ├── legacy_ndarray.v0
│   │       ├── test_attr.py
│   │       ├── test_autograd.py
│   │       ├── test_base.py
│   │       ├── test_contrib_control_flow.py
│   │       ├── test_contrib_gluon_data_vision.py
│   │       ├── test_contrib_hawkesll.py
│   │       ├── test_contrib_intgemm.py
│   │       ├── test_contrib_io.py
│   │       ├── test_contrib_krprod.py
│   │       ├── test_contrib_operator.py
│   │       ├── test_contrib_optimizer.py
│   │       ├── test_contrib_stes_op.py
│   │       ├── test_deferred_compute.py
│   │       ├── test_dgl_graph.py
│   │       ├── test_dynamic_shape.py
│   │       ├── test_engine.py
│   │       ├── test_engine_import.py
│   │       ├── test_exc_handling.py
│   │       ├── test_executor.py
│   │       ├── test_extensions.py
│   │       ├── test_ffi_container.py
│   │       ├── test_gluon.py
│   │       ├── test_gluon_batch_processor.py
│   │       ├── test_gluon_control_flow.py
│   │       ├── test_gluon_data.py
│   │       ├── test_gluon_estimator.py
│   │       ├── test_gluon_event_handler.py
│   │       ├── test_gluon_indexing.py
│   │       ├── test_gluon_model_zoo.py
│   │       ├── test_gluon_probability_v2.py
│   │       ├── test_gluon_rnn.py
│   │       ├── test_gluon_save.py
│   │       ├── test_gluon_trainer.py
│   │       ├── test_gluon_utils.py
│   │       ├── test_higher_order_grad.py
│   │       ├── test_image.py
│   │       ├── test_infer_shape.py
│   │       ├── test_infer_type.py
│   │       ├── test_io.py
│   │       ├── test_kvstore.py
│   │       ├── test_kvstore_custom.py
│   │       ├── test_loss.py
│   │       ├── test_memory_opt.py
│   │       ├── test_metric.py
│   │       ├── test_ndarray.py
│   │       ├── test_numpy_contrib_gluon_data_vision.py
│   │       ├── test_numpy_default_dtype.py
│   │       ├── test_numpy_gluon.py
│   │       ├── test_numpy_gluon_data_vision.py
│   │       ├── test_numpy_interoperability.py
│   │       ├── test_numpy_loss.py
│   │       ├── test_numpy_ndarray.py
│   │       ├── test_numpy_op.py
│   │       ├── test_operator.py
│   │       ├── test_optimizer.py
│   │       ├── test_profiler.py
│   │       ├── test_random.py
│   │       ├── test_recordio.py
│   │       ├── test_runtime.py
│   │       ├── test_smoke.py
│   │       ├── test_sparse_ndarray.py
│   │       ├── test_sparse_operator.py
│   │       ├── test_subgraph.py
│   │       ├── test_subgraph_op.py
│   │       ├── test_symbol.py
│   │       ├── test_test_utils.py
│   │       ├── test_thread_local.py
│   │       ├── test_tvm_op.py
│   │       └── test_viz.py
│   ├── tutorials/
│   │   ├── test_sanity_tutorials.py
│   │   └── test_tutorials.py
│   └── utils/
│       └── notebook_test/
│           └── __init__.py
└── tools/
    ├── bandwidth/
    │   ├── .gitignore
    │   ├── README.md
    │   ├── measure.py
    │   └── test_measure.py
    ├── cfn/
    │   └── Readme.md
    ├── create_source_archive.sh
    ├── dependencies/
    │   ├── LICENSE.binary.dependencies
    │   ├── README.md
    │   ├── cityhash.sh
    │   ├── curl.sh
    │   ├── eigen.sh
    │   ├── libpng.sh
    │   ├── libtiff.sh
    │   ├── libturbojpeg.sh
    │   ├── libz.sh
    │   ├── lz4.sh
    │   ├── make_shared_dependencies.sh
    │   ├── mkl.sh
    │   ├── numpy_mkl.sh
    │   ├── openblas.sh
    │   ├── opencv.sh
    │   ├── openssl.sh
    │   ├── patch/
    │   │   └── opencv_lapack.h
    │   ├── protobuf.sh
    │   └── zmq.sh
    ├── diagnose.py
    ├── flakiness_checker.py
    ├── git-pre-commit
    ├── im2rec.cc
    ├── im2rec.py
    ├── ipynb2md.py
    ├── kill-mxnet.py
    ├── launch.py
    ├── license_header.py
    ├── lint/
    │   ├── clang_format_ci.sh
    │   └── git-clang-format-13
    ├── parse_log.py
    ├── pip/
    │   ├── MANIFEST.in
    │   ├── doc/
    │   │   ├── CPU_ADDITIONAL.md
    │   │   ├── CU101_ADDITIONAL.md
    │   │   ├── CU102_ADDITIONAL.md
    │   │   ├── CU110_ADDITIONAL.md
    │   │   ├── CU112_ADDITIONAL.md
    │   │   ├── NATIVE_ADDITIONAL.md
    │   │   └── PYPI_README.md
    │   ├── sanity_test.py
    │   └── setup.py
    ├── profile/
    │   └── tune_python.sh
    ├── rec2idx.py
    ├── staticbuild/
    │   ├── README.md
    │   ├── build.sh
    │   ├── build_lib.sh
    │   └── build_wheel.sh
    └── windowsbuild/
        ├── README.md
        ├── gen_warp.cpp
        └── warp_dll.cpp

================================================
FILE CONTENTS
================================================

================================================
FILE: .asf.yaml
================================================
notifications:
    commits:      commits@mxnet.apache.org
    issues:       issues@mxnet.apache.org
    pullrequests: commits@mxnet.apache.org

github:
  features:
    wiki: true
    issues: true
    projects: true

  enabled_merge_buttons:
    squash:  true
    merge:   false
    rebase:  true


================================================
FILE: .clang-format
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

---
Language: Cpp
BasedOnStyle: Google
ColumnLimit: 100
AlignConsecutiveAssignments: true
AlignConsecutiveDeclarations: false
AlignConsecutiveMacros: true
DerivePointerAlignment: false
MaxEmptyLinesToKeep: 1
PointerAlignment: Left
AllowAllParametersOfDeclarationOnNextLine: false
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: Empty
AllowShortIfStatementsOnASingleLine: false
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
# SortIncludes was previously listed twice in this section (true, then false).
# Only the later entry is kept; NOTE(review): this assumes the later value was
# the one in effect -- confirm that include sorting is meant to stay disabled.
SortIncludes: false
BreakBeforeTernaryOperators: false
---
Language: JavaScript
DisableFormat: true


================================================
FILE: .clang-tidy
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# The checks defined here will be run and will display by default as warnings.
# Entries are comma-separated globs: every glob needs its own comma, otherwise
# two adjacent entries are read as a single pattern that matches no check.
Checks: >
    -*, cppcoreguidelines-*, clang-analyzer-*, modernize-*,
    performance-faster-string-find, performance-for-range-copy,
    performance-implicit-conversion-in-loop, performance-inefficient-algorithm,
    performance-inefficient-string-concatenation, performance-trivially-destructible,
    performance-inefficient-vector-operation, performance-move-const-arg,
    performance-move-constructor-init, performance-noexcept-move-constructor,
    performance-no-automatic-move, performance-unnecessary-copy-initialization,
    performance-type-promotion-in-math-fn

# performance checks not enabled due to segmentation fault in clang-tidy v8+:
# performance-unnecessary-value-param

# In order to trigger an error, you must have a rule defined both in checks and in this section.
WarningsAsErrors: >
    cppcoreguidelines-no-malloc, modernize-deprecated-headers,
    modernize-loop-convert, modernize-make-shared, modernize-pass-by-value, modernize-make-unique,
    modernize-raw-string-literal, modernize-redundant-void-arg, modernize-replace-auto-ptr,
    modernize-replace-random-shuffle, modernize-return-braced-init-list, modernize-shrink-to-fit,
    modernize-unary-static-assert, modernize-use-bool-literals, modernize-use-default-member-init,
    modernize-use-emplace, modernize-use-equals-default, modernize-use-equals-delete,
    modernize-use-noexcept, modernize-use-nullptr, modernize-use-override,
    modernize-use-transparent-functors, modernize-use-using,
    performance-faster-string-find, performance-implicit-conversion-in-loop,
    performance-inefficient-algorithm, performance-inefficient-string-concatenation,
    performance-trivially-destructible, performance-inefficient-vector-operation,
    performance-move-const-arg, performance-move-constructor-init,
    performance-noexcept-move-constructor, performance-no-automatic-move,
    performance-unnecessary-copy-initialization, performance-type-promotion-in-math-fn

# modernize checks not enforced:
# modernize-use-auto
# modernize-avoid-bind

# performance checks not enforced due to segmentation fault
# performance-for-range-copy

# Todo: define a better regex match that includes most project headers, but excludes third party
# code.
HeaderFilterRegex: '^src/.*'


================================================
FILE: .cmakelintrc
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#
# build and install are separated so changes to build don't invalidate
# the whole docker cache for the image

# --filter= options: https://pypi.org/project/cmakelint/
# "-" disable option
# "+" enable option
filter=-convention/filename,-linelength,-package/consistency,-readability/logic,-readability/mixedcase,-readability/wonkycase,-syntax,-whitespace/eol,+whitespace/extra,-whitespace/indent,-whitespace/mismatch,-whitespace/newline,-whitespace/tabs


================================================
FILE: .codecov.yml
================================================
# Codecov.io configuration file
# See https://docs.codecov.io/docs/codecovyml-reference
codecov:
  notify:
    require_ci_to_pass: yes

coverage:
  status:
    project: off
    patch: on
  precision: 2
  round: down
  range: "70...100"

parsers:
  gcov:
    branch_detection:
      conditional: yes
      loop: yes
      method: no
      macro: no

ignore:
 - "tests/**/*"

# Disable comments for now to gather data in the background
comment: false
#  layout: "header, diff"
#  behavior: default
#  require_changes: no


================================================
FILE: .git-blame-ignore-revs
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# Clang-formatter initial commit - /src directory is formatted
e359bcd65e453d4bc86d3d8e5b1dee3916a2e426

# Clang-formatter initial commit - OneDNN files
718a860f3aa8f24acca2aec867a3b31bc60a6e79


================================================
FILE: .gitattributes
================================================
.gitattributes export-ignore
R-package/* export-ignore


================================================
FILE: .github/ISSUE_TEMPLATE/bug_report.md
================================================
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: 'Bug, needs triage'
assignees: ''

---
## Description
(A clear and concise description of what the bug is.)

### Error Message
(Paste the complete error message. Please also include stack trace by setting environment variable `DMLC_LOG_STACK_TRACE_DEPTH=100` before running your script.)

## To Reproduce
(If you developed your own code, please provide a short script that reproduces the error. For existing examples, please provide link.)

### Steps to reproduce
(Paste the commands you ran that produced the error.)

1.
2.

## What have you tried to solve it?

1.
2.

## Environment

***We recommend using our script for collecting the diagnostic information with the following command***
`curl --retry 10 -s https://raw.githubusercontent.com/apache/mxnet/master/tools/diagnose.py | python3`

<details>
<summary>Environment Information</summary>

```
# Paste the diagnose.py command output here
```

</details>


================================================
FILE: .github/ISSUE_TEMPLATE/config.yml
================================================
blank_issues_enabled: false
contact_links:
  - name: GitHub Discussions
    url: https://github.com/apache/mxnet/discussions
    about: Use GitHub Discussions to ask and answer questions, exchange ideas, and share learning.
  - name: Discourse Forum
    url: https://discuss.mxnet.io/
    about: Discuss forum for usage questions.
  - name: Stack Overflow
    url: https://stackoverflow.com/questions/tagged/mxnet
    about: Ask and answer usage questions on Stack Overflow


================================================
FILE: .github/ISSUE_TEMPLATE/feature_request.md
================================================
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: 'Feature request'
assignees: ''

---

## Description
(A clear and concise description of what the feature is.)
- If the proposal is about a new model, provide description of what the model is.
- If the proposal is about an API, provide mock examples if possible.

## References
- list reference and related literature
- list known implementations


================================================
FILE: .github/ISSUE_TEMPLATE/flaky_test.md
================================================
---
name: Flaky test
about: Report a flaky test
title: ''
labels: 'Flaky'
assignees: ''

---
## Description
(The location and name of the flaky test.)

## Occurrences
(Links to the known occurrences.)

## What have you tried to solve it?

1.
2.


================================================
FILE: .github/ISSUE_TEMPLATE/rfc.md
================================================
---
name: Request for comment (RFC)
about: RFC process requests for review on the design of a new feature or bug fix that involves more efforts. This thread is automatically mirrored to the dev@mxnet.apache.org mailing list.
title: '[RFC] '
labels: 'RFC'
assignees: ''

---

## Problem statement
(A clear and concise description of what this contribution is trying to solve.)

## Proposed solutions
(Description of the approach this contribution takes to solve the problem.)

## References
- list reference and related literature
- list known implementations


================================================
FILE: .github/PULL_REQUEST_TEMPLATE.md
================================================
## Description ##
(Brief description on what this PR is about)

## Checklist ##
### Essentials ###
- [ ] PR's title starts with a category (e.g. [BUGFIX], [MODEL], [TUTORIAL], [FEATURE], [DOC], etc)
- [ ] Changes are complete (i.e. I finished coding on this PR)
- [ ] All changes have test coverage
- [ ] Code is well-documented

### Changes ###
- [ ] Feature1, tests, (and when applicable, API doc)
- [ ] Feature2, tests, (and when applicable, API doc)

## Comments ##
- If this change is a backward incompatible change, why must this change be made.
- Interesting edge cases to note here


================================================
FILE: .github/workflows/greetings.yml
================================================
name: Greetings

on: [pull_request, issues]

jobs:
  greeting:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/first-interaction@v1
      env:
        GITHUB_PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
        GITHUB_PR_RUN_ID: ${{ github.run_id }}
        GITHUB_PR_BASE_REF: ${{ github.event.pull_request.base.ref }} 
      with:
        repo-token: ${{ secrets.GITHUB_TOKEN }}
        issue-message: |
          Welcome to Apache MXNet (incubating)! We are on a mission to democratize AI, and we are glad that you are contributing to it by opening this issue.
          Please make sure to include all the relevant context, and one of the @apache/mxnet-committers will be here shortly.
          If you are interested in contributing to our project, let us know! Also, be sure to check out our guide on [contributing to MXNet](https://mxnet.apache.org/community/contribute) and our [development guides wiki](https://cwiki.apache.org/confluence/display/MXNET/Developments).
        pr-message: |
          Welcome to Apache MXNet (incubating)! We are on a mission to democratize AI, and we are glad that you are contributing to it by opening this pull request.
          Please make sure that the changes are covered by tests. One of the @apache/mxnet-committers will be here shortly.
          If you run into any issue with the CI and tests, we recommend that you first check out our guide on [developer guides wiki](https://cwiki.apache.org/confluence/display/MXNET/Developments).
          Let our @apache/mxnet-committers know if you need any help!


================================================
FILE: .github/workflows/license_check.yml
================================================
name: license check

on: [push, pull_request]

defaults:
  run:
    shell: bash

jobs:
  licensecheck:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
    steps:
      - name: Checkout repository
        uses: actions/checkout@v2

      - name: Update Submodules
        run: |
          git submodule update --init --recursive

      - name: Check License Header
        uses: apache/skywalking-eyes@main
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}


================================================
FILE: .github/workflows/link_check.yml
================================================
name: link check

on: [push, pull_request]

defaults:
  run:
    shell: bash

jobs:
  linkcheck:
    runs-on: ubuntu-20.04
    strategy:
      fail-fast: false
    steps:
      - name: Checkout repository
        uses: actions/checkout@v2

      - name: Compilation cache
        uses: actions/cache@v2
        with:
          path: ~/.ccache
          # We include the commit sha in the cache key, as new cache entries are
          # only created if there is no existing entry for the key yet.
          key: ${{ runner.os }}-ccache-${{ github.sha }}
          # Restore any ccache cache entry, if none for
          # ${{ runner.os }}-ccache-${{ github.sha }} exists
          restore-keys: |
            ${{ runner.os }}-ccache

      - name: Setup python
        uses: actions/setup-python@v2
        with:
          python-version: '3.8'
          architecture: x64

      - name: Install Dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y libopenblas-dev ninja-build ccache python3-sphinx \
              pandoc gcc-7 g++-7 libopencv-dev protobuf-compiler libprotobuf-dev
          ccache -M 500M  # Limit the ccache size; Github's overall cache limit is 5GB
          python -m pip install pandoc-attributes==0.1.7
          python -m pip install -r docs/python_docs/requirements
          python -m pip install docs/python_docs/themes/mx-theme
        shell: bash

      - name: Build project
        env:
          CC: gcc-7
          CXX: g++-7
        run: |
          git submodule update --init --recursive
          mkdir build; cd build
          CXXFLAGS="-Wno-error=strict-overflow" cmake \
              -DCMAKE_BUILD_TYPE="RelWithDebInfo" \
              -DUSE_ONEDNN=OFF \
              -DUSE_CUDA=OFF \
              -G Ninja ..
          ninja
          cd ..
        shell: bash

      - name: Setup Python
        run: |
          python -m pip install --user -e python

      - name: Link Check
        env:
          MAX_RETRY: 3
        run: |
          # Run `make linkcheck` up to MAX_RETRY times; succeed on the first
          # passing attempt and fail the step only if every attempt fails.
          #
          # Enter the docs directory once: the path is relative, so repeating
          # the `cd` on a later attempt would fail.
          cd docs/python_docs/python
          # Use an arithmetic loop: bash performs brace expansion before
          # variable expansion, so `{1..$MAX_RETRY}` is never expanded into
          # a sequence of numbers.
          for ((attempt = 1; attempt <= MAX_RETRY; attempt++))
          do
            make clean
            # Test the command directly in `if`: with `shell: bash`, GitHub
            # Actions enables errexit, so a failing command outside a
            # condition would abort the step before any retry could happen.
            if timeout 10m make linkcheck EVAL=0
            then
              exit 0
            fi
            echo "Link check attempt ${attempt} of ${MAX_RETRY} failed."
          done
          exit 1


================================================
FILE: .github/workflows/os_x_mklbuild.yml
================================================
name: mkl continuous build

on: [push, pull_request]

jobs:
  macosx-x86_64:
    runs-on: macos-10.15
    steps:
      - name: Checkout repository
        uses: actions/checkout@v2

      - name: Compilation cache
        uses: actions/cache@v2
        with:
          path: ~/.ccache
          # We include the commit sha in the cache key, as new cache entries are
          # only created if there is no existing entry for the key yet.
          key: ${{ runner.os }}-ccache-${{ github.sha }}
          # Restore any ccache cache entry, if none for
          # ${{ runner.os }}-ccache-${{ github.sha }} exists
          restore-keys: |
            ${{ runner.os }}-ccache

      - name: Setup python
        uses: actions/setup-python@v2
        with:
          python-version: '3.8'
          architecture: x64

      - name: Install Dependencies
        run: |
          brew install nasm automake ninja libtool cmake pkgconfig protobuf hdf5 zlib ccache
          ccache -M 500M  # Limit the ccache size; Github's overall cache limit is 5GB
          python -m pip install -r ci/docker/install/requirements
        shell: bash

      - name: Build project
        run: |
          ./tools/staticbuild/build.sh cpu mkl

      - name: Setup Python
        run: |
          python -m pip install --user -e python

      - name: Test project
        run: |
          python -m pytest -n 4 --durations=50 --verbose tests/python/unittest/ -k 'not test_operator and not (test_subgraph or test_custom_op or test_external_op or test_recordimage_dataset_with_data_loader_multiworker or test_multi_worker or test_multi_worker_shape or test_multi_worker_forked_data_loader or test_multi_worker_dataloader_release_pool)' -m 'not serial'
          MXNET_ENGINE_TYPE=NaiveEngine python -m pytest -n 4 --durations=50 --verbose tests/python/unittest/ -k 'test_operator and not (test_subgraph or test_custom_op or test_external_op or test_recordimage_dataset_with_data_loader_multiworker or test_multi_worker or test_multi_worker_shape or test_multi_worker_forked_data_loader or test_multi_worker_dataloader_release_pool)' -m 'not serial'
          python -m pytest --durations=50 --verbose tests/python/unittest/ -k 'not (test_subgraph or test_custom_op or test_external_op or test_recordimage_dataset_with_data_loader_multiworker or test_multi_worker or test_multi_worker_shape or test_multi_worker_forked_data_loader or test_multi_worker_dataloader_release_pool)' -m 'serial'
          python -m pytest -n 4 --durations=50 --verbose tests/python/dnnl -k 'not (test_bf16_operator or test_amp or test_amp_subgraph)'


================================================
FILE: .github/workflows/os_x_staticbuild.yml
================================================
name: continuous build

on: [push, pull_request]

jobs:
  macosx-x86_64:
    runs-on: macos-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v2

      - name: Compilation cache
        uses: actions/cache@v2
        with:
          path: ~/.ccache
          # We include the commit sha in the cache key, as new cache entries are
          # only created if there is no existing entry for the key yet.
          key: ${{ runner.os }}-ccache-${{ github.sha }}
          # Restore any ccache cache entry, if none for
          # ${{ runner.os }}-ccache-${{ github.sha }} exists
          restore-keys: |
            ${{ runner.os }}-ccache

      - name: Setup python
        uses: actions/setup-python@v2
        with:
          python-version: '3.8'
          architecture: x64

      - name: Install Dependencies
        run: |
          brew install nasm automake ninja libtool cmake pkgconfig protobuf hdf5 zlib ccache
          ccache -M 500M  # Limit the ccache size; Github's overall cache limit is 5GB
          python -m pip install -r ci/docker/install/requirements
        shell: bash

      - name: Build project
        run: |
          CMAKE_STATICBUILD=1 ./tools/staticbuild/build.sh cpu

      - name: Setup Python
        run: |
          python -m pip install --user -e python

      - name: Build with Cython
        run: |
          cd python
          python setup.py build_ext --inplace --with-cython

      - name: Test project
        env:
          MXNET_ENABLE_CYTHON: 1
        run: |
          python3 -m pytest -n 4 --durations=50 --verbose tests/python/unittest/ -k 'not test_operator and not (test_subgraph or test_custom_op or test_external_op or test_recordimage_dataset_with_data_loader_multiworker or test_multi_worker or test_multi_worker_shape or test_multi_worker_forked_data_loader or test_multi_worker_dataloader_release_pool)' -m 'not serial'
          MXNET_ENGINE_TYPE=NaiveEngine python3 -m pytest -n 4 --durations=50 --verbose tests/python/unittest/ -k 'test_operator and not (test_subgraph or test_custom_op or test_external_op or test_recordimage_dataset_with_data_loader_multiworker or test_multi_worker or test_multi_worker_shape or test_multi_worker_forked_data_loader or test_multi_worker_dataloader_release_pool)' -m 'not serial'
          python3 -m pytest --durations=50 --verbose tests/python/unittest/ -k 'not (test_subgraph or test_custom_op or test_external_op or test_recordimage_dataset_with_data_loader_multiworker or test_multi_worker or test_multi_worker_shape or test_multi_worker_forked_data_loader or test_multi_worker_dataloader_release_pool)' -m 'serial'

      - name: Test Array API
        env:
          MXNET_ENABLE_CYTHON: 1
        run: |
          # Clone the array-api-tests suite next to the checkout, pinned to a
          # fixed commit, and run the selected test modules against mxnet.numpy.
          cd ..
          git clone https://github.com/data-apis/array-api-tests.git
          cd array-api-tests
          git checkout c1dba80a196a03f880d2e0a998a272fb3867b720
          # Only the module name is exported here; a trailing `pytest` word on
          # this line would be exported as a separate (empty) variable.
          export ARRAY_API_TESTS_MODULE=mxnet.numpy
          export DMLC_LOG_STACK_TRACE_DEPTH=100
          python3 -m pytest --reruns 3 --durations=50 --verbose array_api_tests/test_creation_functions.py
          python3 -m pytest --reruns 3 --durations=50 --verbose array_api_tests/test_indexing.py
          python3 -m pytest --reruns 3 --durations=50 --verbose array_api_tests/test_constants.py
          python3 -m pytest --reruns 3 --durations=50 --verbose array_api_tests/test_elementwise_functions.py
          python3 -m pytest --reruns 3 --durations=50 --verbose array_api_tests/test_broadcasting.py
          python3 -m pytest --reruns 3 --durations=50 --verbose \
              array_api_tests/test_type_promotion.py::test_elementwise_function_two_arg_bool_type_promotion
          python3 -m pytest --reruns 3 --durations=50 --verbose \
              array_api_tests/test_type_promotion.py::test_elementwise_function_two_arg_promoted_type_promotion
          python3 -m pytest --reruns 3 --durations=50 --verbose \
              array_api_tests/test_type_promotion.py::test_elementwise_function_one_arg_bool
          python3 -m pytest --reruns 3 --durations=50 --verbose \
              array_api_tests/test_type_promotion.py::test_elementwise_function_one_arg_type_promotion
          python3 -m pytest --reruns 3 --durations=50 --verbose \
              array_api_tests/test_type_promotion.py::test_operator_one_arg_type_promotion
          python3 -m pytest --reruns 3 --durations=50 --verbose \
              array_api_tests/test_type_promotion.py::test_operator_two_arg_bool_promotion
          python3 -m pytest --reruns 3 --durations=50 --verbose \
              array_api_tests/test_type_promotion.py::test_operator_two_arg_promoted_promotion
          python3 -m pytest --reruns 3 --durations=50 --verbose \
              array_api_tests/test_type_promotion.py::test_operator_inplace_two_arg_promoted_promotion


================================================
FILE: .gitignore
================================================
# Compiled Object files
*.slo
*.lo
*.o
*.obj

# Precompiled Headers
*.gch
*.pch

# Compiled Dynamic libraries
*.so
*.dylib
*.dll

# Fortran module files
*.mod

# Compiled Static libraries
*.lai
*.la
*.a
*.lib

# Executables
*.exe
*.out
*.app
*~

# doc
doc/html
doc/latex
doc/doc
docs/web-data
.jekyll-cache
*.lock

#dmlc
config.mk
config.cmake

*.pyc
.Rhistory
*log
Debug
*suo
tracker

# vim
*.swp
*.swo
*.swn
.vimrc
.ycm_extra_conf.py
.ycm_extra_conf.pyc

# Emacs
.#*
.clang_complete
.dir-locals.el
__pycache__
*.pkl
*.params
*.states
*.json
*.d
cmake-build*
data
model
recommonmark

# R
*.Rcheck
*.rds
*.Rproj
.Rproj.user
R-package/inst/*
*.tar.gz
*.tgz
R-package/man/*.Rd
R-package/R/mxnet_generated.R

# data
*.rec
*.lst
*.zip
*ubyte
*.bin
*.txt
!CMakeLists.txt

# ipython notebook
*_pb2.py
*.ipynb_checkpoints*
input.txt*

# Jetbrain
.idea
.gradle
*.iml

# ctags
tags

# cscope
cscope.out
cscope.files

# Eclipse project config
.project
.cproject
.classpath
.settings
.pydevproject
CMakeFiles
cmake_install.cmake

# Visual Studio Code
.vscode

# Mac OS X
.DS_Store

# Windows
windows_package.7z
windows_package

# Notebook automated tests
!tests/nightly/test_tutorial_config.txt
!tests/nightly/TestNotebook
tests/nightly/tmp_notebook

# pip building tools
tools/pip_package/build
tools/pip_package/dist
tools/pip_package/mxnet.egg-info
tools/pip_package/mxnet

# temporary path for building dependencies when building wheel
deps/
staticdeps/
tmp/
build/
lib/
bin/
model/

# VTune result directories (r000hs, r001hs, ...) in the repository root.
# A pattern starting with "./" never matches in gitignore; a leading "/"
# anchors the pattern to the directory containing this file instead.
/r0*hs

# generated function signature for IDE auto-complete
python/mxnet/symbol/gen_*
python/mxnet/ndarray/gen_*
python/.eggs

# Test artifacts produced by in-source builds
*CTestTestfile.cmake
*DartConfiguration.tcl
tests/Makefile
tests/mxnet_unit_tests

# Code coverage related
.coverage
*.gcov
*.gcno
coverage.xml

# Local CMake build config
cmake_options.yml

# header file generated at compile time
include/onednn/oneapi/dnnl/dnnl_version.h
include/onednn/oneapi/dnnl/dnnl_config.h


================================================
FILE: .gitmodules
================================================
[submodule "3rdparty/dmlc-core"]
	path = 3rdparty/dmlc-core
	url = https://github.com/dmlc/dmlc-core.git
[submodule "3rdparty/ps-lite"]
	path = 3rdparty/ps-lite
	url = https://github.com/dmlc/ps-lite
[submodule "3rdparty/dlpack"]
	path = 3rdparty/dlpack
	url = https://github.com/dmlc/dlpack
[submodule "3rdparty/googletest"]
	path = 3rdparty/googletest
	url = https://github.com/google/googletest.git
[submodule "3rdparty/tvm"]
	path = 3rdparty/tvm
	url = https://github.com/apache/incubator-tvm.git
[submodule "3rdparty/onnx-tensorrt"]
	path = 3rdparty/onnx-tensorrt
	url = https://github.com/onnx/onnx-tensorrt.git
[submodule "3rdparty/nvidia_cub"]
	path = 3rdparty/nvidia_cub
	url = https://github.com/NVlabs/cub.git
[submodule "3rdparty/libzip"]
	path = 3rdparty/libzip
	url = https://github.com/nih-at/libzip.git
[submodule "3rdparty/intgemm"]
	path = 3rdparty/intgemm
	url = https://github.com/kpu/intgemm
[submodule "3rdparty/onednn"]
	path = 3rdparty/onednn
	url = https://github.com/oneapi-src/oneDNN


================================================
FILE: .licenserc.yaml
================================================
# License-header check configuration (see the .licenserc.yaml entry below).
# Every entry in paths-ignore is listed exactly once.
header:
  license:
    spdx-id: Apache-2.0
    copyright-owner: Apache Software Foundation

  paths-ignore:
    - 'licenses'
    - 'LICENSE'
    - 'NOTICE'
    - '3rdparty'
    - 'DISCLAIMER'
    - 'KEYS'
    - 'tools/dependencies/LICENSE.binary.dependencies'
    - 'tools/lint/git-clang-format-13'
    # files not distributed in source archive (listed in tools/source-exclude-artifacts.txt)
    - 'docs'
    - 'CODEOWNERS'
    - '.gitignore'
    - '.codecov.yml'
    - '.gitattributes'
    - '.github'
    - '.gitmodules'
    - '.licenserc.yaml'
    - '.asf.yaml'
    - 'python/mxnet/_cy3/README.md'
    # files licensed under apache-2.0 license but do not include full license headers recognized by skywalking-eyes
    - '**/*.ipynb'
    - 'src/operator/deformable_convolution-inl.h'
    - 'src/operator/deformable_convolution.cc'
    - 'src/operator/deformable_convolution.cu'
    - 'src/operator/contrib/deformable_psroi_pooling-inl.h'
    - 'src/operator/contrib/deformable_psroi_pooling.cc'
    - 'src/operator/contrib/deformable_psroi_pooling.cu'
    - 'src/operator/contrib/multi_proposal-inl.h'
    - 'src/operator/contrib/multi_proposal.cc'
    - 'src/operator/contrib/multi_proposal.cu'
    - 'src/operator/contrib/psroi_pooling.cc'
    - 'src/operator/contrib/psroi_pooling.cu'
    - 'src/operator/nn/dnnl/dnnl_base-inl.h'
    # files licensed under boost license
    - 'cmake/Modules/FindJeMalloc.cmake'
    # files licensed under bsd 2-clause + caffe
    - 'src/operator/nn/pool.cuh'
    - 'src/operator/nn/pool.h'
    - 'src/operator/nn/im2col.cuh'
    - 'src/operator/nn/im2col.h'
    - 'src/operator/contrib/nn/deformable_im2col.cuh'
    - 'src/operator/contrib/nn/deformable_im2col.h'
    - 'src/operator/contrib/nn/modulated_deformable_im2col.cuh'
    - 'src/operator/contrib/nn/modulated_deformable_im2col.h'
    # files licensed under bsd 3-clause
    - 'cmake/upstream/FindBLAS.cmake'
    - 'cmake/upstream/FindCUDAToolkit.cmake'
    - 'cmake/upstream/select_compute_arch.cmake'
    - 'python/mxnet/onnx/mx2onnx/_export_onnx.py'
    - 'python/mxnet/onnx/mx2onnx/_op_translations/_op_translations_opset12.py'
    - 'python/mxnet/onnx/mx2onnx/_op_translations/_op_translations_opset13.py'
    - 'src/operator/contrib/erfinv-inl.h'
    - 'src/operator/numpy/np_einsum_op-inl.h'
    - 'src/operator/numpy/np_einsum_op.cc'
    - 'src/operator/numpy/np_einsum_path_op-inl.h'
    # files licensed under mit license
    - 'src/operator/modulated_deformable_convolution-inl.h'
    - 'src/operator/modulated_deformable_convolution.cc'
    - 'src/operator/modulated_deformable_convolution.cu'
    - 'src/operator/nn/layer_norm_cpu.h'
    # symlinks
    - 'include/dlpack' # symlink to 3rdparty/dlpack/include/dlpack
    - 'include/dmlc' # symlink to 3rdparty/dmlc-core/include/dmlc
    - 'include/mshadow' # symlink to 3rdparty/mshadow/mshadow
    - 'include/onednn' # symlinks to 3rdparty/onednn
    - 'include/nnvm' # symlinks to 3rdparty/tvm/nnvm/include/nnvm
    # test/build data
    - 'tests/python/dnnl/data/test_dnnl_test_dnnl_model_model1.json'


  comment: on-failure


================================================
FILE: .mxnet_root
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at

#   http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# This file marks the root directory of the Apache MXNet repository.


================================================
FILE: 3rdparty/ctc_include/LICENSE
================================================
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.

   ----

   Copyright 2015-2016, Baidu USA LLC.

================================================
FILE: 3rdparty/ctc_include/contrib/moderngpu/LICENSE
================================================
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
* 
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*     * Redistributions of source code must retain the above copyright
*       notice, this list of conditions and the following disclaimer.
*     * Redistributions in binary form must reproduce the above copyright
*       notice, this list of conditions and the following disclaimer in the
*       documentation and/or other materials provided with the distribution.
*     * Neither the name of the NVIDIA CORPORATION nor the
*       names of its contributors may be used to endorse or promote products
*       derived from this software without specific prior written permission.
* 
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/


================================================
FILE: 3rdparty/ctc_include/contrib/moderngpu/include/device/ctaloadbalance.cuh
================================================
/******************************************************************************
 * Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 *
 * Code and text by Sean Baxter, NVIDIA Research
 * See http://nvlabs.github.io/moderngpu for repository and documentation.
 *
 ******************************************************************************/

#pragma once

#include "ctasearch.cuh"
#include "loadstore.cuh"

namespace mgpu {

////////////////////////////////////////////////////////////////////////////////
// DeviceSerialLoadBalanceSearch
// Upper Bound search from A (needles) into B (haystack). The A values are
// natural numbers from aBegin to aEnd. bFirst is the index of the B value at
// bBegin in shared memory.

// Serial walk of the needle indices starting at aBegin against the keys in
// b_shared starting at bBegin. Exactly VT steps are executed; each step either
// stores bFirst + bBegin into a_shared[aBegin] and moves to the next needle,
// or moves to the next key in b_shared. When RangeCheck is true the aEnd/bEnd
// limits take part in the decision; when it is false they are ignored.
template<int VT, bool RangeCheck>
MGPU_DEVICE void DeviceSerialLoadBalanceSearch(const int* b_shared, int aBegin,
	int aEnd, int bFirst, int bBegin, int bEnd, int* a_shared) {

	int haystackKey = b_shared[bBegin];

	#pragma unroll
	for(int step = 0; step < VT; ++step) {
		// Decide whether this step consumes a needle (A) or a haystack key (B).
		const bool takeNeedle = RangeCheck ?
			((aBegin < aEnd) && ((bBegin >= bEnd) || (aBegin < haystackKey))) :
			(aBegin < haystackKey);

		if(takeNeedle) {
			// Record the current B position for this needle, then advance A.
			a_shared[aBegin] = bFirst + bBegin;
			++aBegin;
		} else {
			// Advance B and fetch its next key.
			++bBegin;
			haystackKey = b_shared[bBegin];
		}
	}
}

////////////////////////////////////////////////////////////////////////////////
// CTALoadBalance
// Computes upper_bound(counting_iterator<int>(first), b_global) - 1.

// Unlike most other CTA* functions, CTALoadBalance loads from global memory.
// This returns the loaded B elements at the beginning or end of shared memory
// depending on the aFirst argument.

// CTALoadBalance requires NT * VT + 2 slots of shared memory.
// Per-CTA load-balancing search. Calls ComputeMergeRange (coop = 0, tile size
// NT * VT) to obtain this block's [a0, a1) and [b0, b1) ranges, copies the B
// values starting at b_global[b0] into indices_shared behind the A slots, pads
// the remainder with destCount, then has each thread run a VT-step serial
// search that writes into indices_shared for A indices starting at its own
// merge-path position.
//   destCount      - passed to ComputeMergeRange as the A count; also the
//                    padding value.
//   b_global       - iterator over the B values.
//   sourceCount    - passed to ComputeMergeRange as the B count.
//   block, tid     - block index and thread index supplied by the caller.
//   mp_global      - partition array handed to ComputeMergeRange.
//   indices_shared - scratch/output buffer.
//   loadPrecedingB - also load b_global[b0 - 1]; forced to false when b0 == 0.
// Returns (a0, a1, b0, b1), with b0 reduced by one when the preceding B
// element was loaded.
template<int NT, int VT, typename InputIt>
MGPU_DEVICE int4 CTALoadBalance(int destCount, InputIt b_global,
	int sourceCount, int block, int tid, const int* mp_global,
	int* indices_shared, bool loadPrecedingB) {

	int4 range = ComputeMergeRange(destCount, sourceCount, block, 0, NT * VT,
		mp_global);

	int a0 = range.x;
	int a1 = range.y;
	int b0 = range.z;
	int b1 = range.w;
	// There is no element before index 0, so the request is dropped.
	if(!b0) loadPrecedingB = false;

	// Load one trailing term from B. If we're already at the end, fill the
	// end of the buffer with destCount.
	int aCount = a1 - a0;
	int bCount = b1 - b0;
	int extended = b1 < sourceCount;
	int loadCount = bCount + extended;
	int fillCount = NT * VT + 1 - loadCount - aCount;

	// Buffer layout: aCount A slots, then (optionally) one slot for the
	// preceding B element, then the B values.
	int* a_shared = indices_shared;
	int* b_shared = indices_shared + aCount + (int)loadPrecedingB;

	// Load the B values.
//	DeviceMemToMemLoop<NT>(bCount + extended + (int)loadPrecedingB,
//		b_global + b0 - (int)loadPrecedingB, tid,
//		b_shared - (int)loadPrecedingB);

	// Each thread copies every NT-th element. With loadPrecedingB set the
	// start index is shifted down by one, so the thread with tid == 0 begins
	// at i = -1 and copies b_global[b0 - 1] into b_shared[-1].
	for(int i = tid - (int)loadPrecedingB; i < bCount + extended; i += NT)
		b_shared[i] = b_global[b0 + i];

	// Fill the end of the array with destCount.
	for(int i = tid + extended; i < fillCount; i += NT)
		b_shared[bCount + i] = destCount;
	// Barrier between the writes to b_shared above and the reads below.
	__syncthreads();

	// Run a merge path to find the start of the serial merge for each thread.
	int diag = VT * tid;
	int mp = MergePath<MgpuBoundsUpper>(mgpu::counting_iterator<int>(a0),
		aCount, b_shared, bCount, diag, mgpu::less<int>());

	int a0tid = a0 + mp;
	int b0tid = diag - mp;

	// Subtract 1 from b0 because we want to return upper_bound - 1.
	// a_shared - a0 rebases the output pointer so the search can index it with
	// the absolute A index (a0tid starts at a0 + mp).
	DeviceSerialLoadBalanceSearch<VT, false>(b_shared, a0tid, a1, b0 - 1,
		b0tid, bCount, a_shared - a0);
	__syncthreads();

	// Report the B range as including the preceding element when it was loaded.
	b0 -= (int)loadPrecedingB;
	return make_int4(a0, a1, b0, b1);
}


} // namespace mgpu


================================================
FILE: 3rdparty/ctc_include/contrib/moderngpu/include/device/ctamerge.cuh
================================================
/******************************************************************************
 * Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 *
 * Code and text by Sean Baxter, NVIDIA Research
 * See http://nvlabs.github.io/moderngpu for repository and documentation.
 *
 ******************************************************************************/

#pragma once

#include "ctasearch.cuh"
#include "loadstore.cuh"
#include "sortnetwork.cuh"

namespace mgpu {

////////////////////////////////////////////////////////////////////////////////
// SerialMerge

// Merges VT elements from two sorted runs stored in keys_shared: run A begins
// at aBegin and run B at bBegin. For every output slot the chosen key goes to
// results[] and its source position to indices[]. With RangeCheck the aEnd and
// bEnd limits decide when a run is exhausted; without it they are ignored and
// the B index written to indices[] is reduced by one. Ends with a
// __syncthreads().
template<int VT, bool RangeCheck, typename T, typename Comp>
MGPU_DEVICE void SerialMerge(const T* keys_shared, int aBegin, int aEnd,
	int bBegin, int bEnd, T* results, int* indices, Comp comp) {

	T headA = keys_shared[aBegin];
	T headB = keys_shared[bBegin];

	#pragma unroll
	for(int slot = 0; slot < VT; ++slot) {
		// A wins ties: it is taken whenever B's head does not compare before it.
		const bool takeA = RangeCheck ?
			((bBegin >= bEnd) || ((aBegin < aEnd) && !comp(headB, headA))) :
			!comp(headB, headA);

		if(takeA) {
			results[slot] = headA;
			indices[slot] = aBegin;
			headA = keys_shared[++aBegin];
		} else {
			results[slot] = headB;
			indices[slot] = bBegin - !RangeCheck;
			headB = keys_shared[++bBegin];
		}
	}
	__syncthreads();
}

////////////////////////////////////////////////////////////////////////////////
// FindMergesortFrame and FindMergesortInterval help mergesort (both CTA and
// global merge pass levels) locate lists within the single source array.

// Returns (offset of a, offset of b, length of list).
// Locates the pair of lists that `block` helps merge when `coop` CTAs (or
// threads) cooperate on one merge and each covers nv elements.
// Returns (offset of list A, offset of list B, length of each list).
MGPU_HOST_DEVICE int3 FindMergesortFrame(int coop, int block, int nv) {
	// Each of the two lists covers half of the cooperating group.
	const int listLength = nv * (coop >> 1);

	// Clear the low bits of block to get the first member of its group.
	const int groupFirst = block & ~(coop - 1);
	const int aOffset = nv * groupFirst;

	return make_int3(aOffset, aOffset + listLength, listLength);
}

// Returns (a0, a1, b0, b1) into mergesort input lists between mp0 and mp1.
// Converts the merge-path partitions mp0/mp1 of `block` into absolute ranges
// (a0, a1, b0, b1) inside the mergesort source array. frame is the
// (A offset, B offset, list length) triple from FindMergesortFrame. Every
// bound except a0 is clamped to count.
MGPU_HOST_DEVICE int4 FindMergesortInterval(int3 frame, int coop, int block,
	int nv, int count, int mp0, int mp1) {

	// Cross-diagonal of this block measured from the start of the A sublist.
	const int diag = nv * block - frame.x;

	// For the last block of a merge, the end partition was stored as the begin
	// partition of the following merge, i.e. in the wrong coordinate system
	// (it's 0 where it should be the list length). Such a block simply runs to
	// the end of both lists instead of trusting mp1.
	const bool isLastBlock = ((coop - 1) & block) == coop - 1;

	const int aEnd = isLastBlock ? frame.x + frame.z : frame.x + mp1;
	const int bEnd = isLastBlock ? frame.y + frame.z :
		frame.y + diag + nv - mp1;

	return make_int4(
		frame.x + mp0,
		min(count, aEnd),
		min(count, frame.y + diag - mp0),
		min(count, bEnd));
}

////////////////////////////////////////////////////////////////////////////////
// ComputeMergeRange

// Returns the (a0, a1, b0, b1) source ranges handled by `block`, using the
// partitions stored at mp_global[block] and mp_global[block + 1]. A non-zero
// coop selects the mergesort layout (both lists inside one array of aCount
// elements); coop == 0 selects a plain merge of separate A and B inputs with
// NV outputs per block.
MGPU_HOST_DEVICE int4 ComputeMergeRange(int aCount, int bCount, int block,
	int coop, int NV, const int* mp_global) {

	// Merge paths produced by the partitioning kernel.
	const int pathBegin = mp_global[block];
	const int pathEnd = mp_global[block + 1];

	if(coop) {
		// Mergesort: translate through the frame of the cooperating group.
		return FindMergesortInterval(FindMergesortFrame(coop, block, NV),
			coop, block, NV, aCount, pathBegin, pathEnd);
	}

	// Plain merge: whatever part of the tile is not taken from A comes from B.
	const int tileBegin = NV * block;
	const int tileEnd = min(aCount + bCount, tileBegin + NV);
	return make_int4(pathBegin, pathEnd, tileBegin - pathBegin,
		tileEnd - pathEnd);
}

////////////////////////////////////////////////////////////////////////////////
// CTA mergesort support

// One merge pass of the CTA blocksort for thread `tid`. Groups of `coop`
// threads merge two adjacent lists in keys_shared, each covering coop / 2
// threads' worth of VT keys. MergePath finds this thread's starting split and
// SerialMerge writes its VT merged keys to keys[] and their source positions
// to indices[]. All list bounds are clamped to count.
template<int NT, int VT, typename T, typename Comp>
MGPU_DEVICE void CTABlocksortPass(T* keys_shared, int tid, int count,
	int coop, T* keys, int* indices, Comp comp) {

	const int laneMask = coop - 1;

	// Start of the cooperating group's data and this thread's cross-diagonal
	// within the group.
	const int groupStart = VT * (tid & ~laneMask);
	const int crossDiag = min(count, VT * (tid & laneMask));

	// Bounds of the two lists: A is [aFirst, bFirst), B is [bFirst, bLast).
	const int aFirst = min(count, groupStart);
	const int bFirst = min(count, groupStart + VT * (coop / 2));
	const int bLast = min(count, groupStart + VT * coop);

	const int split = MergePath<MgpuBoundsLower>(keys_shared + aFirst,
		bFirst - aFirst, keys_shared + bFirst, bLast - bFirst, crossDiag,
		comp);

	SerialMerge<VT, true>(keys_shared, aFirst + split, bFirst,
		bFirst + crossDiag - split, bLast, keys, indices, comp);
}

// Runs the merge passes of the CTA blocksort. coop doubles from 2 up to NT;
// each iteration calls CTABlocksortPass to produce this thread's VT merged
// keys and the indices they were taken from, then stores the keys back into
// keys_shared. When HasValues is true, threadValues is first written to
// values_shared and re-read through those indices.
//   threadValues  - this thread's VT values; updated in place if HasValues.
//   keys_shared   - keys buffer read by each pass and rewritten after it.
//   values_shared - staging buffer for values; only used if HasValues.
//   tid, count    - thread index and number of valid elements.
//   comp          - comparator forwarded to CTABlocksortPass.
// NOTE(review): DeviceThreadToShared and DeviceGather are defined elsewhere;
// any synchronization they perform is not visible here, so the statement
// order below should be kept as is.
template<int NT, int VT, bool HasValues, typename KeyType, typename ValType,
	typename Comp>
MGPU_DEVICE void CTABlocksortLoop(ValType threadValues[VT],
	KeyType* keys_shared, ValType* values_shared, int tid, int count,
	Comp comp) {

	#pragma unroll
	for(int coop = 2; coop <= NT; coop *= 2) {
		// Per-thread results of this pass.
		int indices[VT];
		KeyType keys[VT];
		CTABlocksortPass<NT, VT>(keys_shared, tid, count, coop, keys,
			indices, comp);

		if(HasValues) {
			// Exchange the values through shared memory.
			DeviceThreadToShared<VT>(threadValues, tid, values_shared);
			DeviceGather<NT, VT>(NT * VT, values_shared, indices, tid,
				threadValues);
		}

		// Store results in shared memory in sorted order.
		DeviceThreadToShared<VT>(keys, tid, keys_shared);
	}
}

////////////////////////////////////////////////////////////////////////////////
// CTAMergesort
// Caller provides the keys in shared memory. This function sorts the first
// count elements.

template<int NT, int VT, bool Stable, bool HasValues, typename KeyType,
	typename ValType, typename Comp>
MGPU_DEVICE void CTAMergesort(KeyType threadKeys[VT], ValType threadValues[VT],
	KeyType* keys_shared, ValType* values_shared, int count, int tid,
	Comp comp) {

	// Phase 1: sort the VT keys (and values) held by this thread. Threads
	// whose first slot lies at or beyond `count` skip the sort.
	const bool threadHasKeys = VT * tid < count;
	if(threadHasKeys) {
		if(!Stable)
			OddEvenMergesort<VT>(threadKeys, threadValues, comp);
		else
			OddEvenTransposeSort<VT>(threadKeys, threadValues, comp);
	}

	// Phase 2: write each thread's sorted run to shared memory.
	DeviceThreadToShared<VT>(threadKeys, tid, keys_shared);

	// Phase 3: merge the runs pairwise until one list spans the CTA.
	CTABlocksortLoop<NT, VT, HasValues>(threadValues, keys_shared,
		values_shared, tid, count, comp);
}

template<int NT, int VT, bool Stable, typename KeyType, typename Comp>
MGPU_DEVICE void CTAMergesortKeys(KeyType threadKeys[VT],
	KeyType* keys_shared, int count, int tid, Comp comp) {

	// Keys-only front end for CTAMergesort. HasValues is false, so the value
	// arguments are placeholders: a scratch register array, and keys_shared
	// reinterpreted as int* for the shared value buffer.
	int placeholderValues[VT];
	int* placeholderShared = (int*)keys_shared;
	CTAMergesort<NT, VT, Stable, false>(threadKeys, placeholderValues,
		keys_shared, placeholderShared, count, tid, comp);
}

// Key/value front end for CTAMergesort: forwards all arguments with
// HasValues fixed to true so values are permuted together with their keys.
template<int NT, int VT, bool Stable, typename KeyType, typename ValType,
	typename Comp>
MGPU_DEVICE void CTAMergesortPairs(KeyType threadKeys[VT],
	ValType threadValues[VT], KeyType* keys_shared, ValType* values_shared,
	int count, int tid, Comp comp) {

	CTAMergesort<NT, VT, Stable, true>(threadKeys, threadValues, keys_shared,
		values_shared, count, tid, comp);
}

////////////////////////////////////////////////////////////////////////////////
// DeviceMergeKeysIndices

template<int NT, int VT, bool LoadExtended, typename It1, typename It2,
	typename T, typename Comp>
MGPU_DEVICE void DeviceMergeKeysIndices(It1 a_global, int aCount, It2 b_global,
	int bCount, int4 range, int tid, T* keys_shared, T* results, int* indices,
	Comp comp) {

	// range packs the tile's source intervals: A = [x, y), B = [z, w).
	const int aBegin = range.x;
	const int aEnd = range.y;
	const int bBegin = range.z;
	const int bEnd = range.w;
	const int aLen = aEnd - aBegin;
	const int bLen = bEnd - bBegin;

	// Every thread starts its serial merge at cross-diagonal VT * tid.
	const int diag = VT * tid;

	if(!LoadExtended) {
		// Plain path: load exactly the two intervals back to back into
		// shared memory, A first and B immediately after it.
		DeviceLoad2ToShared<NT, VT, VT>(a_global + aBegin, aLen,
			b_global + bBegin, bLen, tid, keys_shared);

		// Locate this thread's starting point along the merge path.
		const int split = MergePath<MgpuBoundsLower>(keys_shared, aLen,
			keys_shared + aLen, bLen, diag, comp);

		// Range-checked serial merge into registers.
		SerialMerge<VT, true>(keys_shared, split, aLen, aLen + diag - split,
			aLen + bLen, results, indices, comp);
		return;
	}

	// Extended path: when both inputs have an element past this tile's
	// interval, load one extra key from each so the merge loop can run
	// without range checks.
	const bool extended = (aEnd < aCount) && (bEnd < bCount);
	const int padding = (int)extended;
	const int aLoaded = aLen + padding;
	const int bLoaded = bLen + padding;

	DeviceLoad2ToShared<NT, VT, VT + 1>(a_global + aBegin, aLoaded,
		b_global + bBegin, bLoaded, tid, keys_shared);

	// B starts after the (possibly padded) A data in shared memory.
	const int split = MergePath<MgpuBoundsLower>(keys_shared, aLen,
		keys_shared + aLoaded, bLen, diag, comp);

	const int aCursor = split;
	const int bCursor = aLoaded + diag - split;
	if(extended) {
		// Sentinels are present: no end bounds are needed.
		SerialMerge<VT, false>(keys_shared, aCursor, 0, bCursor, 0, results,
			indices, comp);
	} else {
		// No sentinels: fall back to the range-checked merge.
		SerialMerge<VT, true>(keys_shared, aCursor, aLen, bCursor,
			aLoaded + bLen, results, indices, comp);
	}
}

////////////////////////////////////////////////////////////////////////////////
// DeviceMerge
// Merge pairs from global memory into global memory. Useful factorization to
// enable calling from merge, mergesort, and locality sort.

template<int NT, int VT, bool HasValues, bool LoadExtended, typename KeysIt1,
	typename KeysIt2, typename KeysIt3, typename ValsIt1, typename ValsIt2,
	typename KeyType, typename ValsIt3, typename Comp>
MGPU_DEVICE void DeviceMerge(KeysIt1 aKeys_global, ValsIt1 aVals_global,
	int aCount, KeysIt2 bKeys_global, ValsIt2 bVals_global, int bCount,
	int tid, int block, int4 range, KeyType* keys_shared, int* indices_shared,
	KeysIt3 keys_global, ValsIt3 vals_global, Comp comp) {

	// Merge this tile's keys into registers, recording the source slot of
	// every output element.
	KeyType mergedKeys[VT];
	int sourceSlots[VT];
	DeviceMergeKeysIndices<NT, VT, LoadExtended>(aKeys_global, aCount,
		bKeys_global, bCount, range, tid, keys_shared, mergedKeys,
		sourceSlots, comp);

	// Stage the merged keys in shared memory before the global store.
	DeviceThreadToShared<VT>(mergedKeys, tid, keys_shared);

	// Tile extents come from the packed range: A = [x, y), B = [z, w).
	const int aLen = range.y - range.x;
	const int bLen = range.w - range.z;
	const int tileLen = aLen + bLen;
	const int tileOffset = NT * VT * block;

	DeviceSharedToGlobal<NT, VT>(tileLen, keys_shared, tid,
		keys_global + tileOffset);

	if(HasValues) {
		// Publish the source slots, then let the transfer helper copy the
		// values from the A/B value arrays to the output tile.
		DeviceThreadToShared<VT>(sourceSlots, tid, indices_shared);

		DeviceTransferMergeValuesShared<NT, VT>(tileLen,
			aVals_global + range.x, bVals_global + range.z, aLen,
			indices_shared, tid, vals_global + tileOffset);
	}
}

} // namespace mgpu


================================================
FILE: 3rdparty/ctc_include/contrib/moderngpu/include/device/ctascan.cuh
================================================
/******************************************************************************
 * Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 *
 * Code and text by Sean Baxter, NVIDIA Research
 * See http://nvlabs.github.io/moderngpu for repository and documentation.
 *
 ******************************************************************************/

#pragma once

#include "../mgpuenums.h"
#include "deviceutil.cuh"
#include "intrinsics.cuh"

namespace mgpu {

////////////////////////////////////////////////////////////////////////////////
// CTAReduce

template<int NT, typename Op = mgpu::plus<int> >
struct CTAReduce {
	typedef typename Op::first_argument_type T;
	enum { Size = NT, Capacity = NT };
	struct Storage { T shared[Capacity]; };

	// Reduces one value per thread across the CTA with `op` and returns the
	// result to every thread. `storage` provides one slot per thread.
	MGPU_DEVICE static T Reduce(int tid, T x, Storage& storage, Op op = Op()) {
		T* slots = storage.shared;
		slots[tid] = x;
		__syncthreads();

		// Tree reduction: on each pass the lower `half` threads combine
		// their running value with the matching slot in the upper half.
		#pragma unroll
		for(int half = NT / 2; half >= 1; half /= 2) {
			if(tid < half) {
				x = op(x, slots[half + tid]);
				slots[tid] = x;
			}
			__syncthreads();
		}

		// Slot 0 now holds the full reduction. The trailing barrier keeps
		// any thread from overwriting storage before everyone has read it.
		T result = slots[0];
		__syncthreads();
		return result;
	}
};

#if __CUDA_ARCH__ >= 300

// Specialization of CTAReduce for integer addition, compiled only when
// __CUDA_ARCH__ >= 300. It uses the shfl_add helper (defined outside this
// section) in two phases, so only WARP_SIZE ints of shared storage are needed.
template<int NT>
struct CTAReduce<NT, mgpu::plus<int> > {
	typedef mgpu::plus<int> Op;
	typedef int T;
	enum { Size = NT, Capacity = WARP_SIZE };
	struct Storage { int shared[Capacity]; };

	// Returns the sum of x over all NT threads to every thread.
	// NOTE(review): SecSize = NT / WARP_SIZE is used as a bit mask, so NT is
	// presumably a power of two no smaller than WARP_SIZE -- confirm.
	MGPU_DEVICE static int Reduce(int tid, int x, Storage& storage,
		Op op = Op()) {

		const int NumSections = WARP_SIZE;
		const int SecSize = NT / NumSections;
		int lane = (SecSize - 1) & tid;
		int sec = tid / SecSize;

		// In the first phase, threads cooperatively find the reduction within
		// their segment. The segments are SecSize threads (NT / WARP_SIZE)
		// wide.
		#pragma unroll
		for(int offset = 1; offset < SecSize; offset *= 2)
			x = shfl_add(x, offset, SecSize);

		// The last thread in each segment stores the local reduction to shared
		// memory.
		if(SecSize - 1 == lane) storage.shared[sec] = x;
		__syncthreads();

		// Reduce the totals of each input segment. The spine is WARP_SIZE
		// threads wide.
		if(tid < NumSections) {
			x = storage.shared[tid];
			#pragma unroll
			for(int offset = 1; offset < NumSections; offset *= 2)
				x = shfl_add(x, offset, NumSections);
			storage.shared[tid] = x;
		}
		__syncthreads();

		// The last spine slot is returned as the CTA-wide total. The trailing
		// barrier protects storage from reuse before all threads have read it.
		int reduction = storage.shared[NumSections - 1];
		__syncthreads();

		return reduction;
	}
};

// Specialization of CTAReduce for integer maximum, compiled only when
// __CUDA_ARCH__ >= 300. Same two-phase structure as the plus<int>
// specialization, using the shfl_max helper (defined outside this section).
template<int NT>
struct CTAReduce<NT, mgpu::maximum<int> > {
	typedef mgpu::maximum<int> Op;
	// Expose the value type like the primary template and the plus<int>
	// specialization, so generic code can name CTAReduce<NT, Op>::T.
	typedef int T;
	enum { Size = NT, Capacity = WARP_SIZE };
	struct Storage { int shared[Capacity]; };

	// Returns the maximum of x over all NT threads to every thread.
	// NOTE(review): SecSize = NT / WARP_SIZE is used as a bit mask, so NT is
	// presumably a power of two no smaller than WARP_SIZE -- confirm.
	MGPU_DEVICE static int Reduce(int tid, int x, Storage& storage,
		Op op = Op()) {

		const int NumSections = WARP_SIZE;
		const int SecSize = NT / NumSections;
		int lane = (SecSize - 1) & tid;
		int sec = tid / SecSize;

		// Phase 1: reduce within each section of SecSize threads.
		#pragma unroll
		for(int offset = 1; offset < SecSize; offset *= 2)
			x = shfl_max(x, offset, SecSize);

		// The last thread of each section publishes the section's result.
		if(SecSize - 1 == lane) storage.shared[sec] = x;
		__syncthreads();

		// Phase 2: the first WARP_SIZE threads reduce the section results.
		if(tid < NumSections) {
			x = storage.shared[tid];
			#pragma unroll
			for(int offset = 1; offset < NumSections; offset *= 2)
				x = shfl_max(x, offset, NumSections);
			storage.shared[tid] = x;
		}
		__syncthreads();

		// The last spine slot is returned as the CTA-wide result. The trailing
		// barrier protects storage from reuse before all threads have read it.
		int reduction = storage.shared[NumSections - 1];
		__syncthreads();

		return reduction;
	}
};

#endif // __CUDA_ARCH__ >= 300

////////////////////////////////////////////////////////////////////////////////
// CTAScan

// Generic CTA-wide scan of one value per thread using shared memory.
// Storage holds 2 * NT + 1 elements; the scan loop alternates between the
// halves starting at offsets 0 and NT, so each pass reads one half and
// writes the other.
template<int NT, typename Op = mgpu::plus<int> >
struct CTAScan {
	typedef typename Op::result_type T;
	enum { Size = NT, Capacity = 2 * NT + 1 };
	struct Storage { T shared[Capacity]; };

	// Scans x across the CTA with op.
	//   total    - receives the reduction of all NT inputs.
	//   type     - MgpuScanTypeExc returns the exclusive scan (thread 0 gets
	//              `identity`); any other value returns the inclusive scan.
	//   identity - value returned to thread 0 for an exclusive scan.
	MGPU_DEVICE static T Scan(int tid, T x, Storage& storage, T* total,
		MgpuScanType type = MgpuScanTypeExc, T identity = (T)0, Op op = Op()) {

		storage.shared[tid] = x;
		int first = 0;
		__syncthreads();

		#pragma unroll
		for(int offset = 1; offset < NT; offset += offset) {
			// Combine with the value `offset` slots to the left, read from
			// the half written by the previous pass.
			if(tid >= offset)
				x = op(storage.shared[first + tid - offset], x);
			// Flip to the other half before writing, so readers of the old
			// half are not disturbed.
			first = NT - first;
			storage.shared[first + tid] = x;
			__syncthreads();
		}
		// The last inclusive value is the reduction of the whole CTA.
		*total = storage.shared[first + NT - 1];

		// Exclusive scan: take the left neighbour's inclusive value.
		if(MgpuScanTypeExc == type)
			x = tid ? storage.shared[first + tid - 1] : identity;

		__syncthreads();
		return x;
	}
	// Convenience overload: exclusive scan with identity 0; the total is
	// discarded.
	MGPU_DEVICE static T Scan(int tid, T x, Storage& storage) {
		T total;
		return Scan(tid, x, storage, &total, MgpuScanTypeExc, (T)0, Op());
	}
};

////////////////////////////////////////////////////////////////////////////////
// Special partial specialization for CTAScan<NT, mgpu::plus<int> > on Kepler.
// This uses the shfl intrinsic to reduce scan latency.

#if __CUDA_ARCH__ >= 300

// Specialization of CTAScan for integer addition, compiled only when
// __CUDA_ARCH__ >= 300. The CTA is split into WARP_SIZE segments of
// NT / WARP_SIZE threads; each segment is scanned with the shfl_add helper
// (defined outside this section), and the first WARP_SIZE threads then scan
// the per-segment totals.
template<int NT>
struct CTAScan<NT, mgpu::plus<int> > {
	typedef mgpu::plus<int> Op;
	// Expose the value type like the primary template, so generic code can
	// name CTAScan<NT, Op>::T.
	typedef int T;
	enum { Size = NT, NumSegments = WARP_SIZE, SegSize = NT / NumSegments };
	enum { Capacity = NumSegments + 1 };
	struct Storage { int shared[Capacity + 1]; };

	// Scans x across the CTA.
	//   total    - receives the sum of all NT inputs.
	//   type     - MgpuScanTypeExc returns the exclusive scan; any other
	//              value returns the inclusive scan.
	//   identity - for an exclusive scan, a non-zero identity replaces the
	//              result of thread 0.
	// NOTE(review): SegSize is used as a bit mask, so NT is presumably a
	// power of two no smaller than WARP_SIZE -- confirm.
	MGPU_DEVICE static int Scan(int tid, int x, Storage& storage, int* total,
		MgpuScanType type = MgpuScanTypeExc, int identity = 0, Op op = Op()) {

		// Define WARP_SIZE segments that are NT / WARP_SIZE large.
		// Each warp makes log(SegSize) shfl_add calls.
		// The spine makes log(WARP_SIZE) shfl_add calls.
		int lane = (SegSize - 1) & tid;
		int segment = tid / SegSize;

		// Scan each segment using shfl_add.
		int scan = x;
		#pragma unroll
		for(int offset = 1; offset < SegSize; offset *= 2)
			scan = shfl_add(scan, offset, SegSize);

		// Store the reduction (last element) of each segment into storage.
		if(SegSize - 1 == lane) storage.shared[segment] = scan;
		__syncthreads();

		// The first NumSegments threads scan the segment totals. Each slot is
		// rewritten with its exclusive prefix, and the grand total is stored
		// to shared[NumSegments]. (NumSegments = WARP_SIZE)
		if(tid < NumSegments) {
			int segTotal = storage.shared[tid];
			int spineScan = segTotal;
			#pragma unroll
			for(int offset = 1; offset < NumSegments; offset *= 2)
				spineScan = shfl_add(spineScan, offset, NumSegments);
			storage.shared[tid] = spineScan - segTotal;
			if(NumSegments - 1 == tid)
				storage.shared[NumSegments] = spineScan;
		}
		__syncthreads();

		// Add the scanned partials back in and convert to exclusive scan.
		scan += storage.shared[segment];
		if(MgpuScanTypeExc == type) {
			scan -= x;
			if(identity && !tid) scan = identity;
		}
		*total = storage.shared[NumSegments];
		__syncthreads();

		return scan;
	}
	// Convenience overload: exclusive scan with identity 0; the total is
	// discarded.
	MGPU_DEVICE static int Scan(int tid, int x, Storage& storage) {
		int total;
		return Scan(tid, x, storage, &total, MgpuScanTypeExc, 0);
	}
};

#endif // __CUDA_ARCH__ >= 300

////////////////////////////////////////////////////////////////////////////////
// CTABinaryScan

// CTA-wide exclusive scan of one boolean per thread.
//   tid    - thread index within the CTA.
//   x      - this thread's predicate.
//   shared - scratch space; slots [0, NumWarps] are written, so it must hold
//            at least NT / WARP_SIZE + 1 ints.
//   total  - receives the number of threads in the CTA whose predicate is set.
// Returns the number of set predicates among threads with a smaller tid.
template<int NT>
MGPU_DEVICE int CTABinaryScan(int tid, bool x, int* shared, int* total) {
	const int NumWarps = NT / WARP_SIZE;
	int warp = tid / WARP_SIZE;
	// Position of this thread inside its warp. The mask with tid is required:
	// without it every thread would count the same low bits of the ballot
	// instead of only the bits belonging to lower lanes.
	int lane = (WARP_SIZE - 1) & tid;

	// Store the bit totals for each warp.
	uint bits = __ballot(x);
	shared[warp] = popc(bits);
	__syncthreads();

#if __CUDA_ARCH__ >= 300
	if(tid < NumWarps) {
		int warpCount = shared[tid];
		int scan = warpCount;
		#pragma unroll
		for(int offset = 1; offset < NumWarps; offset *= 2)
			scan = shfl_add(scan, offset, NumWarps);
		// Convert the inclusive warp scan to exclusive.
		shared[tid] = scan - warpCount;
		// The inclusive value of the last warp is the CTA total. It must be
		// published here because *total is read from shared[NumWarps] below.
		if(NumWarps - 1 == tid) shared[NumWarps] = scan;
	}
	__syncthreads();

#else
	// Thread 0 scans warp totals.
	if(!tid) {
		int scan = 0;
		#pragma unroll
		for(int i = 0; i < NumWarps; ++i) {
			int y = shared[i];
			shared[i] = scan;
			scan += y;
		}
		shared[NumWarps] = scan;
	}
	__syncthreads();

#endif // __CUDA_ARCH__ >= 300

	// Add the warp scan back into the partials: the warp's exclusive offset
	// plus the number of set ballot bits in lanes below this one.
	int scan = shared[warp] + __popc(bfe(bits, 0, lane));
	*total = shared[NumWarps];
	__syncthreads();
	return scan;
}

} // namespace mgpu


================================================
FILE: 3rdparty/ctc_include/contrib/moderngpu/include/device/ctasearch.cuh
================================================
/******************************************************************************
 * Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 *
 * Code and text by Sean Baxter, NVIDIA Research
 * See http://nvlabs.github.io/moderngpu for repository and documentation.
 *
 ******************************************************************************/

#pragma once

#include "deviceutil.cuh"
#include "../mgpudevice.cuh"

namespace mgpu {

template<MgpuBounds Bounds, typename IntT, typename It, typename T,
	typename Comp>
MGPU_HOST_DEVICE void BinarySearchIt(It data, int& begin, int& end, T key,
	int shift, Comp comp) {

	// One step of a (possibly biased) binary search over [begin, end).
	// The probe is (begin + (2^shift - 1) * end) / 2^shift: the midpoint for
	// shift == 1, and progressively closer to `end` for larger shifts. IntT
	// is the integer type used for that weighted sum.
	IntT weight = (1<< shift) - 1;
	int probe = (int)((begin + weight * end)>> shift);

	T probed = data[probe];

	// Upper bound advances past elements equal to key; lower bound stops at
	// the first element that is not less than key.
	bool advance;
	if(MgpuBoundsUpper == Bounds)
		advance = !comp(key, probed);
	else
		advance = comp(probed, key);

	if(advance)
		begin = probe + 1;
	else
		end = probe;
}

template<MgpuBounds Bounds, typename IntT, typename T, typename It,
	typename Comp>
MGPU_HOST_DEVICE int BiasedBinarySearch(It data, int count, T key, int levels,
	Comp comp) {

	// Binary search of data[0, count) whose first probes are biased toward
	// the end of the range. `levels` (0..4) selects how many biased probes
	// are made, strongest bias first, before the ordinary bisection.
	int lo = 0;
	int hi = count;

	if(levels >= 4 && lo < hi)
		BinarySearchIt<Bounds, IntT>(data, lo, hi, key, 9, comp);
	if(levels >= 3 && lo < hi)
		BinarySearchIt<Bounds, IntT>(data, lo, hi, key, 7, comp);
	if(levels >= 2 && lo < hi)
		BinarySearchIt<Bounds, IntT>(data, lo, hi, key, 5, comp);
	if(levels >= 1 && lo < hi)
		BinarySearchIt<Bounds, IntT>(data, lo, hi, key, 4, comp);

	// Finish with plain midpoint bisection until the range is empty.
	while(lo < hi)
		BinarySearchIt<Bounds, int>(data, lo, hi, key, 1, comp);
	return lo;
}

template<MgpuBounds Bounds, typename T, typename It, typename Comp>
MGPU_HOST_DEVICE int BinarySearch(It data, int count, T key, Comp comp) {
	// Plain bisection of data[0, count). Returns the lower- or upper-bound
	// index of key, depending on Bounds.
	int lo = 0;
	int hi = count;
	while(lo < hi)
		BinarySearchIt<Bounds, int>(data, lo, hi, key, 1, comp);
	return lo;
}

////////////////////////////////////////////////////////////////////////////////
// MergePath search

template<MgpuBounds Bounds, typename It1, typename It2, typename Comp>
MGPU_HOST_DEVICE int MergePath(It1 a, int aCount, It2 b, int bCount, int diag,
	Comp comp) {

	// Binary-searches the cross-diagonal `diag` of the merge of a[0, aCount)
	// and b[0, bCount). Returns how many elements of `a` lie before the
	// diagonal; the remaining diag - result elements come from `b`.
	typedef typename std::iterator_traits<It1>::value_type T;

	// The count taken from a is limited by the sizes of both inputs.
	int lo = max(0, diag - bCount);
	int hi = min(diag, aCount);

	while(lo < hi) {
		int mid = (lo + hi)>> 1;
		T aKey = a[mid];
		T bKey = b[diag - 1 - mid];

		// Decide whether a[mid] belongs before the diagonal. For equal keys
		// the lower-bound form takes from a and the upper-bound form from b.
		bool takeFromA;
		if(MgpuBoundsUpper == Bounds)
			takeFromA = comp(aKey, bKey);
		else
			takeFromA = !comp(bKey, aKey);

		if(takeFromA)
			lo = mid + 1;
		else
			hi = mid;
	}
	return lo;
}


////////////////////////////////////////////////////////////////////////////////
// SegmentedMergePath search

template<typename InputIt, typename Comp>
MGPU_HOST_DEVICE int SegmentedMergePath(InputIt keys, int aOffset, int aCount,
	int bOffset, int bCount, int leftEnd, int rightStart, int diag, Comp comp) {

	// leftEnd and rightStart are measured from the origin of `keys`; diag is
	// measured from aOffset. A search is only needed when the diagonal falls
	// strictly between leftEnd and rightStart, i.e. inside the segment that
	// straddles the two halves.
	const int diagFromOrigin = aOffset + diag;
	if(diagFromOrigin <= leftEnd) return diag;
	if(diagFromOrigin >= rightStart) return aCount;

	// Only the part of B that ends before rightStart can take part.
	const int bLimit = min(bCount, rightStart - bOffset);
	int lo = max(max(leftEnd - aOffset, 0), diag - bLimit);
	int hi = min(diag, aCount);

	while(lo < hi) {
		const int mid = (lo + hi)>> 1;
		const int aPos = aOffset + mid;
		const int bPos = bOffset + diag - 1 - mid;

		// Take from A unless the B key orders strictly before the A key.
		if(comp(keys[bPos], keys[aPos]))
			hi = mid;
		else
			lo = mid + 1;
	}
	return lo;
}

////////////////////////////////////////////////////////////////////////////////
// BalancedPath search

// Balanced-path partition search. Starts from the lower-bound merge path at
// `diag` and, when Duplicates is true, redistributes a run of equal keys
// between A and B. Returns int2(aIndex, star): aIndex is the number of
// elements taken from `a`, and star is a flag (0 or 1) marking a "starred"
// partition. `levels` is forwarded to BiasedBinarySearch.
template<bool Duplicates, typename IntT, typename InputIt1, typename InputIt2,
	typename Comp>
MGPU_HOST_DEVICE int2 BalancedPath(InputIt1 a, int aCount, InputIt2 b,
	int bCount, int diag, int levels, Comp comp) {

	typedef typename std::iterator_traits<InputIt1>::value_type T;

	int p = MergePath<MgpuBoundsLower>(a, aCount, b, bCount, diag, comp);
	int aIndex = p;
	int bIndex = diag - p;

	bool star = false;
	if(bIndex < bCount) {
		if(Duplicates) {
			T x = b[bIndex];

			// Search for the beginning of the duplicate run in both A and B.
			// The searches cover only the prefixes [0, aIndex) and
			// [0, bIndex) that lie before the merge path.
			int aStart = BiasedBinarySearch<MgpuBoundsLower, IntT>(a, aIndex, x,
				levels, comp);
			int bStart = BiasedBinarySearch<MgpuBoundsLower, IntT>(b, bIndex, x,
				levels, comp);

			// The distance between the merge path and the lower_bound is the
			// 'run'. We add up the a- and b- runs and evenly distribute them to
			// get a stairstep path.
			int aRun = aIndex - aStart;
			int bRun = bIndex - bStart;
			int xCount = aRun + bRun;

			// Attempt to advance b and regress a.
			int bAdvance = max(xCount>> 1, bRun);
			int bEnd = min(bCount, bStart + bAdvance + 1);
			int bRunEnd = BinarySearch<MgpuBoundsUpper>(b + bIndex,
				bEnd - bIndex, x, comp) + bIndex;
			bRun = bRunEnd - bStart;

			// b cannot advance past the end of its run of x; a takes the
			// remainder of the combined run.
			bAdvance = min(bAdvance, bRun);
			int aAdvance = xCount - bAdvance;

			// Star the partition when a ends up one element ahead of b while
			// b still has another copy of x available.
			bool roundUp = (aAdvance == bAdvance + 1) && (bAdvance < bRun);
			aIndex = aStart + aAdvance;

			if(roundUp) star = true;
		} else {
			if(aIndex && aCount) {
				T aKey = a[aIndex - 1];
				T bKey = b[bIndex];

				// If the last consumed element in A (aIndex - 1) is the same as
				// the next element in B (bIndex), we're sitting at a starred
				// partition.
				if(!comp(aKey, bKey)) star = true;
			}
		}
	}
	return make_int2(aIndex, star);
}

} // namespace mgpu


================================================
FILE: 3rdparty/ctc_include/contrib/moderngpu/include/device/ctasegreduce.cuh
================================================
/******************************************************************************
 * Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 *
 * Code and text by Sean Baxter, NVIDIA Research
 * See http://nvlabs.github.io/moderngpu for repository and documentation.
 *
 ******************************************************************************/

#pragma once

#include "ctasegscan.cuh"
#include "ctasearch.cuh"

namespace mgpu {

////////////////////////////////////////////////////////////////////////////////
// Segmented reduce utility functions.

// Extract the upper-bound indices from the coded ranges. Decrement to include
// the first addressed row/segment.

// Decoded form of a pair of packed range limits; filled in by
// DeviceShiftRange.
struct SegReduceRange {
	int begin;			// low 31 bits of the first limit
	int end;			// low 31 bits of the second limit, +1 if !flushLast
	int total;			// end - begin, taken before the +1 adjustment
	bool flushLast;		// true when the top bit of the second limit is clear
};

MGPU_DEVICE SegReduceRange DeviceShiftRange(int limit0, int limit1) {
	// Each limit packs an index in its low 31 bits; the top bit of limit1
	// carries a flag that, when set, clears flushLast.
	SegReduceRange decoded;
	decoded.begin = 0x7fffffff & limit0;
	decoded.end = 0x7fffffff & limit1;
	decoded.total = decoded.end - decoded.begin;
	decoded.flushLast = (0x80000000 & limit1) == 0;

	// When the last row is not flushed, extend the range by one so that row
	// is included.
	if(!decoded.flushLast) ++decoded.end;
	return decoded;
}

// Reconstitute row/segment indices from a starting row index and packed end
// flags. Used for pre-processed versions of interval reduce and interval Spmv.
template<int VT>
MGPU_DEVICE void DeviceExpandFlagsToRows(int first, int endFlags,
	int rows[VT + 1]) {

	// rows[0] is the starting row. rows[i + 1] is the row after element i,
	// which is one higher than rows[i] when bit i of endFlags is set.
	int currentRow = first;
	rows[0] = currentRow;

	#pragma unroll
	for(int bit = 0; bit < VT; ++bit) {
		if(endFlags & (1<< bit)) ++currentRow;
		rows[bit + 1] = currentRow;
	}
}

////////////////////////////////////////////////////////////////////////////////
// After loading CSR terms into shared memory, each thread binary searches
// (upper-bound) to find its starting point. Each thread then walks forward,
// emitting the csr0-relative row indices to register.

template<int NT, int VT>
MGPU_DEVICE int DeviceExpandCsrRows(int tidOffset, int* csr_shared,
	int numRows, int end, int rows[VT + 1], int rowStarts[VT]) {

	// Upper-bound search for the row that contains element tidOffset.
	int cursor = BinarySearch<MgpuBoundsUpper>(csr_shared, numRows, tidOffset,
		mgpu::less<int>()) - 1;

	// Offsets bounding the current row. Past the last row, `end` is used as
	// the closing offset.
	int rowBegin = csr_shared[cursor];
	int rowEnd = (cursor + 1 < numRows) ? csr_shared[cursor + 1] : end;

	int flags = 0;
	rows[0] = cursor;
	rowStarts[0] = rowBegin;

	// Walk the thread's VT elements. rows[] receives the row index at each
	// step and rowStarts[] the offset where that row begins.
	#pragma unroll
	for(int step = 1; step <= VT; ++step) {
		const bool reachedRowEnd = (tidOffset + step == rowEnd);
		if(reachedRowEnd) {
			// Element step - 1 was the last one of its row.
			flags |= 1<< (step - 1);

			// Move to the next row and fetch its closing offset.
			rowBegin = rowEnd;
			++cursor;
			rowEnd = (cursor + 1 < numRows) ? csr_shared[cursor + 1] : end;
		}
		rows[step] = cursor;
		if(step < VT) rowStarts[step] = rowBegin;
	}
	__syncthreads();

	return flags;
}

////////////////////////////////////////////////////////////////////////////////
// DeviceSegReducePrepare
// Expand non-empty interval of CSR elements into row indices. Compute end-flags
// by comparing adjacent row IDs.

// DeviceSegReducePrepare may be called either by a pre-processing kernel or by
// the kernel that actually evaluates the segmented reduction if no
// preprocessing is desired.
// Per-thread result of DeviceSegReducePrepare.
struct SegReduceTerms {
	int endFlags;		// end-flag bits returned by DeviceExpandCsrRows
	int tidDelta;		// distance returned by DeviceFindSegScanDelta
};

template<int NT, int VT>
MGPU_DEVICE SegReduceTerms DeviceSegReducePrepare(int* csr_shared, int numRows,
	int tid, int gid, bool flushLast, int rows[VT + 1], int rowStarts[VT]) {

	// Sentinel used as the start of the "next" segment once the CSR rows run
	// out: the end of this tile when flushing, INT_MAX otherwise.
	const int sentinel = flushLast ? (gid + NT * VT) : INT_MAX;
	const int threadStart = gid + VT * tid;

	SegReduceTerms terms;
	terms.endFlags = DeviceExpandCsrRows<NT, VT>(threadStart, csr_shared,
		numRows, sentinel, rows, rowStarts);

	// A thread contains an end flag exactly when its first and last row
	// indices differ. That flag decides whether carry-out values from the
	// left propagate through this thread. Note that csr_shared is reused as
	// scratch space by DeviceFindSegScanDelta.
	const bool hasEndFlag = rows[0] != rows[VT];
	terms.tidDelta = DeviceFindSegScanDelta<NT>(tid, hasEndFlag, csr_shared);

	return terms;
}

////////////////////////////////////////////////////////////////////////////////
// CTASegReduce
// Core segmented reduction code. Supports fast-path and slow-path for intra-CTA
// segmented reduction. Stores partials to global memory.
// Callers feed CTASegReduce::ReduceToGlobal values in thread order.
// CTA-wide segmented reduction over VT values per thread.
//   HalfCapacity - when true the shared value buffer holds only NV / 2
//                  partials, and tiles with more partials than that are
//                  stored directly to global memory.
// Storage is a union: the segmented-scan scratch and the value buffer share
// the same shared memory.
template<int NT, int VT, bool HalfCapacity, typename T, typename Op>
struct CTASegReduce {
	typedef CTASegScan<NT, Op> SegScan;

	enum {
		NV = NT * VT,
		Capacity = HalfCapacity ? (NV / 2) : NV
	};

	union Storage {
		typename SegScan::Storage segScanStorage;
		T values[Capacity];
	};

	// Reduces this thread's data[] by segment and writes one partial per
	// segment end to dest_global (offset by startRow).
	//   rows            - row index before each element, plus one trailing
	//                     entry; a segment ends at i when
	//                     rows[i] != rows[i + 1].
	//   total           - number of partials the CTA stores.
	//   tidDelta        - forwarded to SegScan::SegScanDelta.
	//   carryOut_global - thread 0 stores the CTA's carry-out at [block].
	template<typename DestIt>
	MGPU_DEVICE static void ReduceToGlobal(const int rows[VT + 1], int total,
		int tidDelta, int startRow, int block, int tid, T data[VT],
		DestIt dest_global, T* carryOut_global, T identity, Op op,
		Storage& storage) {

		// Run a segmented scan within the thread.
		T x, localScan[VT];
		#pragma unroll
		for(int i = 0; i < VT; ++i) {
			x = i ? op(x, data[i]) : data[i];
			localScan[i] = x;
			// Restart the running value after each segment end.
			if(rows[i] != rows[i + 1]) x = identity;
		}

		// Run a parallel segmented scan over the carry-out values to compute
		// carry-in.
		T carryOut;
		T carryIn = SegScan::SegScanDelta(tid, tidDelta, x,
			storage.segScanStorage, &carryOut, identity, op);

		// Store the carry-out for the entire CTA to global memory.
		if(!tid) carryOut_global[block] = carryOut;

		dest_global += startRow;
		if(HalfCapacity && total > Capacity) {
			// Add carry-in to each thread-local scan value. Store directly
			// to global.
			#pragma unroll
			for(int i = 0; i < VT; ++i) {
				// Add the carry-in to the local scan.
				T x2 = op(carryIn, localScan[i]);

				// Store on the end flag and clear the carry-in.
				if(rows[i] != rows[i + 1]) {
					carryIn = identity;
					dest_global[rows[i]] = x2;
				}
			}
		} else {
			// All partials fit in shared memory. Add carry-in to each thread-
			// local scan value.
			#pragma unroll
			for(int i = 0; i < VT; ++i) {
				// Add the carry-in to the local scan.
				T x2 = op(carryIn, localScan[i]);

				// Store reduction when the segment changes and clear the
				// carry-in.
				if(rows[i] != rows[i + 1]) {
					storage.values[rows[i]] = x2;
					carryIn = identity;
				}
			}
			__syncthreads();

			// Cooperatively store reductions to global memory.
			for(int index = tid; index < total; index += NT)
				dest_global[index] = storage.values[index];
			// Barrier so storage can be reused safely after returning.
			__syncthreads();
		}
	}
};

} // namespace mgpu



================================================
FILE: 3rdparty/ctc_include/contrib/moderngpu/include/device/ctasegscan.cuh
================================================
/******************************************************************************
 * Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 *
 * Code and text by Sean Baxter, NVIDIA Research
 * See http://nvlabs.github.io/moderngpu for repository and documentation.
 *
 ******************************************************************************/

#pragma once

#include "ctascan.cuh"

namespace mgpu {

////////////////////////////////////////////////////////////////////////////////
// DeviceFindSegScanDelta
// Runs an inclusive max-index scan over binary inputs.

// Computes, for each thread of an NT-thread CTA, the distance back to the
// closest thread at or before it whose flag is set.
//
//   tid           Thread index within the CTA.
//   flag          This thread's segment flag.
//   delta_shared  Scratch ints. This function indexes entries
//                 [0, 2 * NumWarps): the first NumWarps hold each warp's
//                 ballot bits, the next NumWarps hold each warp's fallback
//                 start position.
//
// Returns tid - start, where start is the index of the closest flagged thread
// at or before tid, or 0 when no such thread exists.
//
// NOTE(review): __ballot is the legacy (non-_sync) warp vote intrinsic and the
// second call sits inside a divergent branch (tid < NumWarps); confirm this is
// acceptable for every architecture/toolkit this header is built for.
template<int NT>
MGPU_DEVICE int DeviceFindSegScanDelta(int tid, bool flag, int* delta_shared) {
	const int NumWarps = NT / 32;

	int warp = tid / 32;
	int lane = 31 & tid;
	// warpMask keeps bits [0, lane]; ctaMask keeps bits [0, lane).
	uint warpMask = 0xffffffff>> (31 - lane);		// inclusive search
	uint ctaMask = 0x7fffffff>> (31 - lane);		// exclusive search

	// Publish this warp's flag bits so other warps can look them up.
	uint warpBits = __ballot(flag);
	delta_shared[warp] = warpBits;
	__syncthreads();

	if(tid < NumWarps) {
		// Thread tid stands in for warp tid here: find the closest preceding
		// warp holding any flag, then the highest flagged lane inside it.
		uint ctaBits = __ballot(0 != delta_shared[tid]);
		int warpSegment = 31 - clz(ctaMask & ctaBits);
		// warpSegment is -1 when no earlier warp has a flag; fall back to 0.
		int start = (-1 != warpSegment) ?
			(31 - clz(delta_shared[warpSegment]) + 32 * warpSegment) : 0;
		delta_shared[NumWarps + tid] = start;
	}
	__syncthreads();

	// Find the closest flag to the left of this thread within the warp.
	// Include the flag for this thread.
	int start = 31 - clz(warpMask & warpBits);
	// A hit inside the warp is rebased by the warp's first thread index
	// (~31 & tid); otherwise use the start recorded for preceding warps.
	if(-1 != start) start += ~31 & tid;
	else start = delta_shared[NumWarps + warp];
	// Barrier before returning so delta_shared is no longer being read when
	// the caller reuses it.
	__syncthreads();

	return tid - start;
}

////////////////////////////////////////////////////////////////////////////////
// CTASegScan

// CTA-wide segmented scan over one value per thread.
//
//   NT   Number of threads in the CTA.
//   _Op  Binary reduction functor; must expose result_type (the scanned type).
template<int NT, typename _Op = mgpu::plus<int> >
struct CTASegScan {
	typedef _Op Op;
	typedef typename Op::result_type T;
	enum { NumWarps = NT / 32, Size = NT, Capacity = 2 * NT };

	// Scratch space shared by the flag search and the scan itself; the two
	// phases are separated by barriers, so they can overlay each other.
	union Storage {
		// DeviceFindSegScanDelta indexes delta[0, 2 * NumWarps): NumWarps
		// ballot words followed by NumWarps per-warp start positions, so the
		// array is declared with that many entries. values[] is always at
		// least as large, so the size of the union is unaffected.
		int delta[2 * NumWarps];
		// Two NT-element halves used as ping-pong buffers by SegScanDelta.
		T values[Capacity];
	};

	// Each thread passes the reduction of the LAST SEGMENT that it covers.
	// flag is set to true if there's at least one segment flag in the thread.
	// SegScan returns the reduction of values for the first segment in this
	// thread over the preceding threads.
	// Return the value init for the first thread.

	// When scanning single elements per thread, interpret the flag as a BEGIN
	// FLAG. If tid's flag is set, its value belongs to thread tid + 1, not
	// thread tid.

	// The function returns the reduction of the last segment in the CTA.

	// Scan with a precomputed tidDelta (distance from tid back to the thread
	// that starts its segment). Thread tid combines with the element `offset`
	// slots to its left only while tidDelta >= offset, which keeps reductions
	// from crossing a segment start.
	//
	//   carryOut  Receives the inclusive result of the last thread (NT - 1).
	//   identity  Value returned to thread 0.
	// Returns the inclusive result of thread tid - 1 (identity for thread 0).
	MGPU_DEVICE static T SegScanDelta(int tid, int tidDelta, T x,
		Storage& storage, T* carryOut, T identity = (T)0, Op op = Op()) {

		// Run an inclusive scan
		int first = 0;
		storage.values[first + tid] = x;
		__syncthreads();

		#pragma unroll
		for(int offset = 1; offset < NT; offset += offset) {
			if(tidDelta >= offset)
				x = op(storage.values[first + tid - offset], x);
			// Flip to the other half so this step's writes cannot race with
			// reads of the previous step's values.
			first = NT - first;
			storage.values[first + tid] = x;
			__syncthreads();
		}

		// Get the exclusive scan.
		x = tid ? storage.values[first + tid - 1] : identity;
		*carryOut = storage.values[first + NT - 1];
		__syncthreads();
		return x;
	}

	// Convenience entry point: derives tidDelta from the per-thread flag and
	// then runs SegScanDelta.
	MGPU_DEVICE static T SegScan(int tid, T x, bool flag, Storage& storage,
		T* carryOut, T identity = (T)0, Op op = Op()) {

		// Find the left-most thread that covers the first segment of this
		// thread.
		int tidDelta = DeviceFindSegScanDelta<NT>(tid, flag, storage.delta);

		return SegScanDelta(tid, tidDelta, x, storage, carryOut, identity, op);
	}
};

} // namespace mgpu


================================================
FILE: 3rdparty/ctc_include/contrib/moderngpu/include/device/ctasegsort.cuh
================================================
/******************************************************************************
 * Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 *
 * Code and text by Sean Baxter, NVIDIA Research
 * See http://nvlabs.github.io/moderngpu for repository and documentation.
 *
 ******************************************************************************/

#pragma once

#include "ctascan.cuh"
#include "ctasearch.cuh"
#include "loadstore.cuh"
#include "sortnetwork.cuh"

namespace mgpu {

// Merges VT elements from the A range [aBegin, aEnd) and B range
// [bBegin, bEnd) of keys_shared into results[], recording the source position
// of each output in indices[]. leftEnd and rightStart bound the region in
// which keys are actually compared: A elements below leftEnd are emitted
// unconditionally, and B is clipped at rightStart.
// When sync is true the function ends with a CTA barrier.
template<int VT, typename T, typename Comp>
MGPU_DEVICE void SegmentedSerialMerge(const T* keys_shared, int aBegin,
	int aEnd, int bBegin, int bEnd, T results[VT], int indices[VT],
	int leftEnd, int rightStart, Comp comp, bool sync = true) {

	// B may not advance past the start of the right segment.
	bEnd = min(rightStart, bEnd);

	T keyA = keys_shared[aBegin];
	T keyB = keys_shared[bBegin];

	#pragma unroll
	for(int slot = 0; slot < VT; ++slot) {
		bool takeA;
		if(aBegin >= aEnd) {
			// A is exhausted: B supplies this output.
			takeA = false;
		} else if(bBegin >= bEnd || aBegin < leftEnd) {
			// Either B is exhausted (or clipped), or A is still left of the
			// compared region: A supplies this output.
			takeA = true;
		} else {
			// Both heads are comparable; A wins ties.
			takeA = !comp(keyB, keyA);
		}

		if(takeA) {
			results[slot] = keyA;
			indices[slot] = aBegin;
			keyA = keys_shared[++aBegin];
		} else {
			results[slot] = keyB;
			indices[slot] = bBegin;
			keyB = keys_shared[++bBegin];
		}
	}

	if(sync) { __syncthreads(); }
}

////////////////////////////////////////////////////////////////////////////////
// CTASegsortPass

// Performs one merge pass of the CTA segmented sort for thread tid: merges
// this thread's slice of two neighbouring lists into results[] / indices[],
// widens activeRange to cover the sibling list's range, and has the first
// thread of each merged pair publish the new packed range to ranges_shared.
template<int NT, int VT, typename T, typename Comp>
MGPU_DEVICE void CTASegsortPass(T* keys_shared, int* ranges_shared, int tid,
	int pass, T results[VT], int indices[VT], int2& activeRange, Comp comp) {

	// Locate the pair of lists this thread helps to merge.
	const int3 mergeFrame = FindMergesortFrame(2<< pass, tid, VT);
	const int aStart = mergeFrame.x;
	const int bStart = mergeFrame.y;
	const int runLength = mergeFrame.z;

	const int listIndex = tid>> pass;
	const bool isOddList = 0 != (1 & listIndex);
	const int crossDiag = VT * tid - aStart;

	// Unpack the sibling list's range: start in the low 16 bits, end in the
	// high 16 bits.
	const int packedSibling = ranges_shared[1 ^ listIndex];
	const int siblingStart = 0x0000ffff & packedSibling;
	const int siblingEnd = packedSibling>> 16;

	// Pick the merge bounds from the pre-merge ranges, then widen
	// activeRange to cover both lists.
	int leftEnd;
	int rightStart;
	if(isOddList) {
		leftEnd = siblingEnd;
		rightStart = activeRange.x;
	} else {
		leftEnd = activeRange.y;
		rightStart = siblingStart;
	}
	activeRange.x = min(activeRange.x, siblingStart);
	activeRange.y = max(activeRange.y, siblingEnd);

	const int mergePoint = SegmentedMergePath(keys_shared, aStart, runLength,
		bStart, runLength, leftEnd, rightStart, crossDiag, comp);

	const int aCursor = aStart + mergePoint;
	const int bCursor = bStart + crossDiag - mergePoint;
	SegmentedSerialMerge<VT>(keys_shared, aCursor, bStart, bCursor,
		bStart + runLength, results, indices, leftEnd, rightStart, comp);

	// The thread sitting at the start of the merged pair stores the combined
	// range for the next pass.
	if(0 == crossDiag) {
		ranges_shared[listIndex>> 1] =
			(int)bfi(activeRange.y, activeRange.x, 16, 16);
	}
}

////////////////////////////////////////////////////////////////////////////////
// CTASegsortLoop

// Runs sLogPow2<NT>::value merge passes (presumably log2(NT) - confirm against
// sLogPow2). After every pass the merged keys are written back to keys_shared
// and, when HasValues is set, the values are permuted through values_shared
// to follow their keys. Returns the final active range.
template<int NT, int VT, bool HasValues, typename KeyType, typename ValType,
	typename Comp>
MGPU_DEVICE int2 CTASegsortLoop(KeyType threadKeys[VT],
	ValType threadValues[VT], KeyType* keys_shared, ValType* values_shared,
	int* ranges_shared, int tid, int2 activeRange, Comp comp) {

	const int PassCount = sLogPow2<NT>::value;

	#pragma unroll
	for(int level = 0; level < PassCount; ++level) {
		// Source position of every key produced by this pass.
		int gatherFrom[VT];
		CTASegsortPass<NT, VT>(keys_shared, ranges_shared, tid, level,
			threadKeys, gatherFrom, activeRange, comp);

		if(HasValues) {
			// Route the values through shared memory so each one lands next
			// to its key.
			DeviceThreadToShared<VT>(threadValues, tid, values_shared);
			DeviceGather<NT, VT>(NT * VT, values_shared, gatherFrom, tid,
				threadValues);
		}

		// Publish the merged keys for the next pass.
		DeviceThreadToShared<VT>(threadKeys, tid, keys_shared);
	}

	return activeRange;
}

////////////////////////////////////////////////////////////////////////////////
// CTASegsort
// Pass keys and values in register. On return, values are returned in register
// and keys returned in shared memory.

// Sorts each thread's VT keys with a sorting network, packs the positions of
// the thread's first and last head flags into ranges_shared[tid], stores the
// keys to keys_shared, and then runs the merge passes. Returns the active
// range produced by CTASegsortLoop.
template<int NT, int VT, bool Stable, bool HasValues, typename KeyType,
	typename ValType, typename Comp>
MGPU_DEVICE int2 CTASegsort(KeyType threadKeys[VT], ValType threadValues[VT],
	int tid, int headFlags, KeyType* keys_shared, ValType* values_shared,
	int* ranges_shared, Comp comp) {

	if(Stable) {
		// Odd-even transpose sort.
		OddEvenTransposeSortFlags<VT>(threadKeys, threadValues, headFlags,
			comp);
	} else {
		// Batcher's odd-even mergesort.
		OddEvenMergesortFlags<VT>(threadKeys, threadValues, headFlags, comp);
	}

	// Position of the last head flag in this thread; stays -1 when the
	// thread has no flags.
	int lastFlag = 31 - clz(headFlags);
	if(-1 != lastFlag) lastFlag += VT * tid;

	// Position of the first head flag in this thread; NT * VT when the
	// thread has no flags.
	const int firstBit = ffs(headFlags);
	int firstFlag;
	if(firstBit) firstFlag = VT * tid - 1 + firstBit;
	else firstFlag = NT * VT;

	ranges_shared[tid] = (int)bfi(lastFlag, firstFlag, 16, 16);

	// Store the VT-length sorted runs to shared memory; the loop below
	// merges them.
	DeviceThreadToShared<VT>(threadKeys, tid, keys_shared);

	return CTASegsortLoop<NT, VT, HasValues>(threadKeys, threadValues,
		keys_shared, values_shared, ranges_shared, tid,
		make_int2(firstFlag, lastFlag), comp);
}


// Keys-only front end for CTASegsort (HasValues = false). A local int array
// and keys_shared reinterpreted as int* are passed in the value slots.
template<int NT, int VT, bool Stable, typename KeyType, typename Comp>
MGPU_DEVICE int2 CTASegsortKeys(KeyType threadKeys[VT], int tid, int headFlags,
	KeyType* keys_shared, int* ranges_shared, Comp comp) {

	int placeholderValues[VT];
	int* placeholderShared = (int*)keys_shared;

	int2 finalRange = CTASegsort<NT, VT, Stable, false>(threadKeys,
		placeholderValues, tid, headFlags, keys_shared, placeholderShared,
		ranges_shared, comp);
	return finalRange;
}

// Key/value front end for CTASegsort (HasValues = true); forwards every
// argument unchanged and returns the resulting active range.
template<int NT, int VT, bool Stable, typename KeyType, typename ValType,
	typename Comp>
MGPU_DEVICE int2 CTASegsortPairs(KeyType threadKeys[VT],
	ValType threadValues[VT], int tid, int headFlags, KeyType* keys_shared,
	ValType* values_shared, int* ranges_shared, Comp comp) {

	int2 finalRange = CTASegsort<NT, VT, Stable, true>(threadKeys,
		threadValues, tid, headFlags, keys_shared, values_shared,
		ranges_shared, comp);
	return finalRange;
}

////////////////////////////////////////////////////////////////////////////////
// DeviceSegBlocksort
// Load keys and values from global memory, sort in shared memory, and store
// back to global memory. Store the left-most and right-most encountered
// headflag locations to ranges_global to prepare for the next pass.
// This function is factored out of the blocksort kernel to allow easier
// customization of that kernel - we have two implementations currently:
// sort over indices and sort over bitfield.

// Sorts one NT * VT tile: loads keys (and values when HasValues) starting at
// tile `block`, runs CTASegsort, writes the sorted tile to keysDest_global /
// valsDest_global, and has thread 0 store the tile's packed active range to
// ranges_global[block].
template<int NT, int VT, bool Stable, bool HasValues, typename InputIt1,
	typename InputIt2, typename KeyType, typename ValType, typename OutputIt1,
	typename OutputIt2, typename Comp>
MGPU_DEVICE void DeviceSegBlocksort(InputIt1 keys_global,
	InputIt2 values_global, int count2, KeyType* keys_shared,
	ValType* values_shared, int* ranges_shared, int headFlags, int tid,
	int block, OutputIt1 keysDest_global, OutputIt2 valsDest_global,
	int* ranges_global, Comp comp) {

	// Offset of this tile within the global arrays.
	const int tileStart = NT * VT * block;

	// Stage keys through shared memory into registers in thread order.
	KeyType threadKeys[VT];
	DeviceGlobalToShared<NT, VT>(count2, keys_global + tileStart, tid,
		keys_shared);
	DeviceSharedToThread<VT>(keys_shared, tid, threadKeys);

	// Same for the values, when present.
	ValType threadValues[VT];
	if(HasValues) {
		DeviceGlobalToShared<NT, VT>(count2, values_global + tileStart, tid,
			values_shared);
		DeviceSharedToThread<VT>(values_shared, tid, threadValues);
	}

	// Run the CTA segmented blocksort.
	const int2 tileRange = CTASegsort<NT, VT, Stable, HasValues>(threadKeys,
		threadValues, tid, headFlags, keys_shared, values_shared, ranges_shared,
		comp);

	// Store the keys to global memory.
	DeviceSharedToGlobal<NT, VT>(count2, keys_shared, tid,
		keysDest_global + tileStart);

	if(HasValues) {
		// Store the values to global memory.
		DeviceThreadToShared<VT>(threadValues, tid, values_shared);
		DeviceSharedToGlobal<NT, VT>(count2, values_shared, tid,
			valsDest_global + tileStart, false);
	}

	// Store the 16-bit packed ranges. These are used by all merge kernels and
	// the first level of global segmented merge path partitioning.
	if(0 == tid) {
		ranges_global[block] = (int)bfi(tileRange.y, tileRange.x, 16, 16);
	}
}

////////////////////////////////////////////////////////////////////////////////
// DeviceIndicesToHeadFlags
// Load indices from an array and cooperatively turn into a head flag bitfield
// for each thread.

// Builds this thread's head-flag bitfield (bit i set when element
// VT * tid + i of the tile starts a segment) from a list of segment-start
// indices.
//
//   indices_global     Segment-start positions; entries [p0, p1) are read.
//   partitions_global  partitions_global[block] and [block + 1] give p0, p1.
//   count2             Number of valid elements in this tile.
//   words_shared       Scratch, cleared and read as ints.
//   flags_shared       Scratch, written one byte per flagged element.
//                      NOTE(review): presumably aliases words_shared (the
//                      bytes written here are read back through
//                      words_shared) - confirm at the call site.
//
// Returns 0 when the tile has no indices and is full; otherwise the VT-bit
// flag field, with bits forced on for positions at or beyond count2.
template<int NT, int VT>
MGPU_DEVICE int DeviceIndicesToHeadFlags(const int* indices_global,
	const int* partitions_global, int tid, int block, int count2,
	int* words_shared, byte* flags_shared) {

	// Each int word is treated as four one-byte flags.
	const int FlagWordsPerThread = MGPU_DIV_UP(VT, 4);
	int gid = NT * VT * block;
	int p0 = partitions_global[block];
	int p1 = partitions_global[block + 1];

	int headFlags = 0;
	// Skip all the work for a full tile with no segment starts.
	if(p1 > p0 || count2 < NT * VT) {

		// Clear the flag bytes, then loop through the indices and poke in flag
		// values.
		#pragma unroll
		for(int i = 0; i < FlagWordsPerThread; ++i)
			words_shared[NT * i + tid] = 0;
		__syncthreads();

		// Threads stride over this tile's indices; headFlag - gid is the
		// flag's position relative to the start of the tile.
		for(int index = p0 + tid; index < p1; index += NT) {
			int headFlag = indices_global[index];
			flags_shared[headFlag - gid] = 1;
		}
		__syncthreads();

		// Combine all the head flags for this thread.
		int first = VT * tid;
		int offset = first / 4;
		int prev = words_shared[offset];
		// Selector passed to prmt; it depends on (first % 4), i.e. on how far
		// this thread's first flag byte sits from a word boundary.
		int mask = 0x3210 + 0x1111 * (3 & first);
		#pragma unroll
		for(int i = 0; i < FlagWordsPerThread; ++i) {
			// Gather the next four flags.
			// NOTE(review): on the final iteration this reads one word past
			// the thread's own flag words - confirm the scratch buffer is
			// sized for that.
			int next = words_shared[offset + 1 + i];
			int x = prmt(prev, next, mask);
			prev = next;

			// Set the head flag bits.
			if(0x00000001 & x) headFlags |= 1<< (4 * i);
			if(0x00000100 & x) headFlags |= 1<< (4 * i + 1);
			if(0x00010000 & x) headFlags |= 1<< (4 * i + 2);
			if(0x01000000 & x) headFlags |= 1<< (4 * i + 3);
		}
		__syncthreads();

		// Set head flags for out-of-range keys.
		int outOfRange = min(VT, first + VT - count2);
		if(outOfRange > 0)
			headFlags = bfi(0xffffffff, headFlags, VT - outOfRange, outOfRange);

		// Clear head flags above VT.
		headFlags &= (1<< VT) - 1;
	}
	return headFlags;
}

////////////////////////////////////////////////////////////////////////////////
// SegSortSupport

// Bundle of raw pointers passed around by the segmented sort.
// NOTE(review): the roles below are inferred from the field names only; the
// code that fills and consumes them is not in this file - confirm there.
struct SegSortSupport {
	// Presumably packed (16-bit start / 16-bit end) per-tile ranges and their
	// unpacked int2 counterparts.
	int* ranges_global;
	int2* ranges2_global;

	// Presumably work lists and counters for the merge and copy passes.
	int4* mergeList_global;
	int* copyList_global;
	int2* queueCounters_global;
	int2* nextCounters_global;

	// Presumably one status byte per tile.
	byte* copyStatus_global;
};

////////////////////////////////////////////////////////////////////////////////
// DeviceSegSortMerge

// Merges two sorted key ranges of a segmented sort pass into one NT * VT
// output tile.
//
//   keys_global / values_global  Source arrays.
//   segmentRange                 (x, y) = (segStart, segEnd) used to bound the
//                                merge and to decide which warps need work.
//   range                        A is [range.x, range.y), B is
//                                [range.z, range.w) in the source arrays.
//   pass                         Merge pass index; bit `pass` of block selects
//                                the list parity.
//   keys_shared, indices_shared  Scratch for keys and gather indices.
//   keysDest_global / valsDest_global  Destination arrays; the tile is written
//                                at offset NT * VT * block.
//
// Warps for which sortWarp is false leave their keys as loaded and use
// identity gather indices.
template<int NT, int VT, bool HasValues, typename KeyType, typename ValueType,
	typename Comp>
MGPU_DEVICE void DeviceSegSortMerge(const KeyType* keys_global,
	const ValueType* values_global, int2 segmentRange, int tid,
	int block, int4 range, int pass, KeyType* keys_shared,
	int* indices_shared, KeyType* keysDest_global, ValueType* valsDest_global,
	Comp comp) {

	const int NV = NT * VT;
	int gid = NV * block;

	// Load the local compressed segment indices.
	int a0 = range.x;
	int aCount = range.y - range.x;
	int b0 = range.z;
	int bCount = range.w - range.z;

	// A is placed at keys_shared[0, aCount) and B directly after it; the
	// merge below addresses B starting at aCount.
	DeviceLoad2ToShared<NT, VT, VT>(keys_global + a0, aCount, keys_global + b0,
		bCount, tid, keys_shared);

	////////////////////////////////////////////////////////////////////////////
	// Run a merge path to find the starting point for each thread to merge.
	// If the entire warp fits into the already-sorted segments, we can skip
	// sorting it and leave its keys in shared memory. Doing this on the warp
	// level rather than thread level (also legal) gives slightly better
	// performance.

	int segStart = segmentRange.x;
	int segEnd = segmentRange.y;
	int listParity = 1 & (block>> pass);

	// First element position covered by this thread's warp.
	int warpOffset = VT * (~31 & tid);
	bool sortWarp = listParity ?
		// The spliced segment is to the left (segStart).
		(warpOffset < segStart) :
		// The spliced segment is to the right (segEnd).
		(warpOffset + 32 * VT > segEnd);

	KeyType threadKeys[VT];
	int indices[VT];
	if(sortWarp) {
		int diag = VT * tid;
		int mp = SegmentedMergePath(keys_shared, 0, aCount, aCount, bCount,
			listParity ? 0 : segEnd, listParity ? segStart : NV, diag, comp);
		int a0tid = mp;
		int a1tid = aCount;
		int b0tid = aCount + diag - mp;
		int b1tid = aCount + bCount;

		// Serial merge into register. All threads in the CTA so we hoist the
		// check for list parity outside the function call to simplify the
		// logic. Unlike in the blocksort, this does not cause warp divergence.
		// sync = false: the barrier is issued below, outside this branch.
		SegmentedSerialMerge<VT>(keys_shared, a0tid, a1tid, b0tid, b1tid,
			threadKeys, indices, listParity ? 0 : segEnd,
			listParity ? segStart : NV, comp, false);
	}
	__syncthreads();

	// Store sorted data in register back to shared memory. Then copy to global.
	if(sortWarp)
		DeviceThreadToShared<VT>(threadKeys, tid, keys_shared, false);
	__syncthreads();

	DeviceSharedToGlobal<NT, VT>(aCount + bCount, keys_shared, tid,
		keysDest_global + gid);

	////////////////////////////////////////////////////////////////////////////
	// Use the merge indices to gather values from global memory. Store directly
	// to valsDest_global.

	if(HasValues) {
		// Transpose the gather indices to help coalesce loads.
		if(sortWarp)
			DeviceThreadToShared<VT>(indices, tid, indices_shared, false);
		else {
			// Warps that skipped the merge keep every element in place.
			#pragma unroll
			for(int i = 0; i < VT; ++i)
				indices_shared[VT * tid + i] = VT * tid + i;
		}
		__syncthreads();

		DeviceTransferMergeValuesShared<NT, VT>(aCount + bCount,
			values_global + a0,  values_global + b0, aCount, indices_shared,
			tid, valsDest_global + NV * block);
	}
}

////////////////////////////////////////////////////////////////////////////////
// DeviceSegSortCopy

// Copies tile `block` of the keys (and of the values when HasValues) from the
// source arrays to the destination arrays without reordering. The copy length
// is clipped to the elements remaining after the tile's start.
template<int NT, int VT, bool HasValues, typename KeyType, typename ValueType>
MGPU_DEVICE void DeviceSegSortCopy(const KeyType* keys_global,
	const ValueType* values_global, int tid, int block, int count,
	KeyType* keysDest_global, ValueType* valsDest_global) {

	const int tileStart = NT * VT * block;
	const int tileCount = min(NT * VT, count - tileStart);

	DeviceGlobalToGlobal<NT, VT>(tileCount, keys_global + tileStart, tid,
		keysDest_global + tileStart);

	if(HasValues) {
		DeviceGlobalToGlobal<NT, VT>(tileCount, values_global + tileStart,
			tid, valsDest_global + tileStart);
	}
}

} // namespace mgpu


================================================
FILE: 3rdparty/ctc_include/contrib/moderngpu/include/device/ctasortedsearch.cuh
================================================
/******************************************************************************
 * Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 *
 * Code and text by Sean Baxter, NVIDIA Research
 * See http://nvlabs.github.io/moderngpu for repository and documentation.
 *
 ******************************************************************************/

#pragma once

#include "../mgpudevice.cuh"
#include "ctasearch.cuh"

namespace mgpu {


////////////////////////////////////////////////////////////////////////////////
// DeviceSerialSearch

// Runs VT steps of a serial merge-style search over two key ranges in
// keys_shared, treating A as the needles and B as the haystack, and records
// one result per step in indices[].
//
// Template flags:
//   Bounds      With MgpuBoundsUpper a step advances A when comp(aKey, bKey);
//               otherwise it advances A when !comp(bKey, aKey).
//   RangeCheck  When true, aBegin / bBegin are tested against aEnd / bEnd
//               before any keys are compared.
//   IndexA/B    For steps that advance A (resp. B), store the current
//               position in the other list, bOffset + bBegin (resp.
//               aOffset + aBegin), in indices[i].
//   MatchA/B    Test whether the other list holds an equal key, OR the
//               match flag into indices[i], and count the matches.
//
// The match flag is the top bit (0x80000000) when the corresponding Index
// option is on, and 1 otherwise.
//
// Returns int3(decisions, matchCountA, matchCountB); bit i of decisions is set
// when step i advanced A.
//
// NOTE(review): aPrev / bPrev are left unset when aBegin / bBegin start at 0,
// and with RangeCheck == false the lower-bound MatchB test reads aPrev without
// checking aBegin > 0 - confirm callers never reach those reads in that state.
template<int VT, MgpuBounds Bounds, bool RangeCheck, bool IndexA, bool MatchA,
	bool IndexB, bool MatchB, typename T, typename Comp>
MGPU_DEVICE int3 DeviceSerialSearch(const T* keys_shared, int aBegin,
	int aEnd, int bBegin, int bEnd, int aOffset, int bOffset, int* indices,
	Comp comp) {

	const int FlagA = IndexA ? 0x80000000 : 1;
	const int FlagB = IndexB ? 0x80000000 : 1;

	T aKey = keys_shared[aBegin];
	T bKey = keys_shared[bBegin];
	T aPrev, bPrev;
	if(aBegin > 0) aPrev = keys_shared[aBegin - 1];
	if(bBegin > 0) bPrev = keys_shared[bBegin - 1];
	int decisions = 0;
	int matchCountA = 0;
	int matchCountB = 0;

	#pragma unroll
	for(int i = 0; i < VT; ++i) {
		// p == true: this step consumes a needle from A.
		bool p;
		if(RangeCheck && aBegin >= aEnd) p = false;
		else if(RangeCheck && bBegin >= bEnd) p = true;
		else p = (MgpuBoundsUpper == Bounds) ?
			comp(aKey, bKey) :
			!comp(bKey, aKey);

		if(p) {
			// aKey is smaller than bKey, so it is inserted before bKey.
			// Save bKey's index (bBegin + first) as the result of the search
			// and advance to the next needle in A.
			bool match = false;
			if(MatchA) {
				// Test if there is an element in B that matches aKey.
				if(MgpuBoundsUpper == Bounds) {
					// Upper Bound: We're inserting aKey after bKey. If there
					// is a match for aKey it must be bPrev. Check that bPrev
					// is in range and equal to aKey.
					// The predicate test result !comp(aKey, bPrev) was
					// established on the previous A-advancing iteration (it
					// failed the comp(aKey, bKey) test to get us to this
					// point). Check the other half of the equality condition
					// with a second comparison.
					// NOTE(review): this compares bBegin against aEnd rather
					// than a B bound; presumably it relies on B being stored
					// at or after aEnd in keys_shared - confirm against the
					// caller's layout.
					bool inRange = !RangeCheck || (bBegin > aEnd);
					match = inRange && !comp(bPrev, aKey);
				} else {
					// Lower Bound: We're inserting aKey before bKey. If there
					// is a match for aKey, it must be bKey. Check that bKey
					// is in range and equal to aKey.
					// The predicate test !comp(bKey, aKey) has established one
					// half of the equality condition. We establish the other
					// half with a second comparison.
					bool inRange = !RangeCheck || (bBegin < bEnd);
					match = inRange && !comp(aKey, bKey);
				}
			}

			int index = 0;
		 	if(IndexA) index = bOffset + bBegin;
			if(match) index |= FlagA;
			if(IndexA || MatchA) indices[i] = index;
			matchCountA += match;

			// Mark the decision bit to indicate that this iteration has
			// progressed A (the needles).
			decisions |= 1<< i;
			aPrev = aKey;
			aKey = keys_shared[++aBegin];
		} else {
			// aKey is larger than bKey, so it is inserted after bKey (but we
			// don't know where yet). Advance the B index to the next element in
			// the haystack to continue the search for the current needle.
			bool match = false;
			if(MatchB) {
				if(MgpuBoundsUpper == Bounds) {
					// Upper Bound: aKey is not smaller than bKey. We advance to
					// the next haystack element in B. If there is a match in A
					// for bKey it must be aKey. By entering this branch we've
					// verified that !comp(aKey, bKey). Making the reciprocal
					// comparison !comp(bKey, aKey) establishes aKey == bKey.
					bool inRange = !RangeCheck ||
						((bBegin < bEnd) && (aBegin < aEnd));
					match = inRange && !comp(bKey, aKey);
				} else {
					// Lower Bound: bKey is smaller than aKey. We advance to the
					// next element in B. If there is a match for bKey, it must
					// be aPrev. The previous A-advancing iteration proved that
					// !comp(bKey, aPrev). We test !comp(aPrev, bKey) for the
					// other half of the equality condition.
					bool inRange = !RangeCheck ||
						((bBegin < bEnd) && (aBegin > 0));
					match = inRange && !comp(aPrev, bKey);
				}
			}

			int index = 0;
			if(IndexB) index = aOffset + aBegin;
			if(match) index |= FlagB;
			if(IndexB || MatchB) indices[i] = index;
			matchCountB += match;

			// Keep the decision bit cleared to indicate that this iteration
			// has progressed B (the haystack).
			bPrev = bKey;
			bKey = keys_shared[++bBegin];
		}
	}
	return make_int3(decisions, matchCountA, matchCountB);
}

////////////////////////////////////////////////////////////////////////////////
// CTASortedSearch
// Take keys in shared memory and return indices and b-match flags in shared
// memory.
// NOTE: This function doesn't do any strided-to-thread order transposes so
// using an even number of values per thread will incur no additional bank
// conflicts.

// Cooperative sorted search over keys already resident in keys_shared.
// Each thread finds its starting point with MergePath, runs VT steps of
// DeviceSerialSearch (range checks are compiled out when `extended` is true),
// and then scatters its per-step results into indices_shared: results for A
// at [0, aCount) and results for B directly after them.
// Returns int2(matches counted for A, matches counted for B) for this thread.
template<int NT, int VT, MgpuBounds Bounds, bool IndexA, bool MatchA,
	bool IndexB, bool MatchB, typename T, typename Comp>
MGPU_DEVICE int2 CTASortedSearch(T* keys_shared, int aStart, int aCount,
	int aEnd, int a0, int bStart, int bCount, int bEnd, int b0, bool extended,
	int tid, int* indices_shared, Comp comp) {

	// Split this thread's diagonal between A and B.
	const int diag = VT * tid;
	const int splitA = MergePath<Bounds>(keys_shared + aStart, aCount,
		keys_shared + bStart, bCount, diag, comp);
	const int splitB = diag - splitA;

	// Serial search into register.
	int stepResults[VT];
	int3 summary;
	if(extended) {
		summary = DeviceSerialSearch<VT, Bounds, false, IndexA, MatchA, IndexB,
			MatchB>(keys_shared, splitA + aStart, aEnd, splitB + bStart, bEnd,
			a0 - aStart, b0 - bStart, stepResults, comp);
	} else {
		summary = DeviceSerialSearch<VT, Bounds, true, IndexA, MatchA, IndexB,
			MatchB>(keys_shared, splitA + aStart, aEnd, splitB + bStart, bEnd,
			a0 - aStart, b0 - bStart, stepResults, comp);
	}
	__syncthreads();

	// Scatter the results. A set decision bit means the step belonged to A;
	// a cleared bit means it belonged to B.
	const int decisionBits = summary.x;
	int destA = splitA;
	int destB = splitB + aCount;
	#pragma unroll
	for(int step = 0; step < VT; ++step) {
		const bool fromA = 0 != (decisionBits & (1<< step));
		if(fromA) {
			if(IndexA || MatchA) indices_shared[destA++] = stepResults[step];
		} else {
			if(IndexB || MatchB) indices_shared[destB++] = stepResults[step];
		}
	}
	__syncthreads();

	// Hand back the match counts for A and B keys.
	return make_int2(summary.y, summary.z);
}

} // namespace mgpu


================================================
FILE: 3rdparty/ctc_include/contrib/moderngpu/include/device/devicetypes.cuh
================================================
/******************************************************************************
 * Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
 * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 *
 * Code and text by Sean Baxter, NVIDIA Research
 * See http://nvlabs.github.io/moderngpu for repository and documentation.
 *
 ******************************************************************************/

#pragma once

#if __CUDA_ARCH__ == 100
	#error "COMPUTE CAPABILITY 1.0 NOT SUPPORTED BY MPGU. TRY 2.0!"
#endif 

#include <climits>
#include "../util/static.h"

#ifdef _MSC_VER
#define INLINESYMBOL __forceinline__
#else
#define INLINESYMBOL inline
#endif

namespace mgpu {

// Function qualifiers used throughout the library: a CUDA execution-space
// specifier (or both) followed by INLINESYMBOL, which is defined earlier in
// this header as __forceinline__ under _MSC_VER and inline otherwise.
#define MGPU_HOST __host__ INLINESYMBOL
#define MGPU_DEVICE __device__ INLINESYMBOL
#define MGPU_HOST_DEVICE __host__ __device__ INLINESYMBOL

// Warp width and its base-2 logarithm (32 == 1 << 5).
const int WARP_SIZE = 32;
const int LOG_WARP_SIZE = 5;

////////////////////////////////////////////////////////////////////////////////
// Device-side comparison operators

template<typename T>
struct less : public std::binary_function<T, T, bool> {
	MGPU_HOST_DEVICE bool operator()(T a, T b) { return a < b; }
};
template<typename T>
struct less_equal : public std::binary_function<T, T, bool> {
	MGPU_HOST_DEVICE bool operator()(T a, T b) { return a <= b; }
};
template<typename T>
struct greater : public std::binary_function<T, T, bool> {
	MGPU_HOST_DEVICE bool operator()(T a, T b) { return a > b; }
};
template<typename T>
struct greater_equal : public std::binary_function<T, T, bool> {
	MGPU_HOST_DEVICE bool operator()(T a, T b) { return a >= b; }
};
template<typename T>
struct equal_to : public std::binary_function<T, T, bool> {
	MGPU_HOST_DEVICE bool operator()(T a, T b) { return a == b; }
};
template<typename T>
struct not_equal_to : public std::binary_function<T, T, bool> {
	MGPU_HOST_DEVICE bool operator()(T a, T b) { return a != b; }
};

////////////////////////////////////////////////////////////////////////////////
// Device-side arithmetic operators

template<typename T>
struct plus : public std::binary_function<T, T, T> {
	MGPU_HOST_DEVICE T operator()(T a, T b) { return a + b; }
};

template<typename T>
struct minus : public std::binary_function<T, T, T> {
	MGPU_HOST_DEVICE T operator()(T a, T b) { return a - b; }
};

template<typename T>
struct multiplies : public std::binary_function<T, T, T> {
	MGPU_HOST_DEVICE T operator()(T a, T b) { return a * b; }
};

template<typename T>
struct modulus : public std::binary_function<T, T, T> {
	MGPU_HOST_DEVICE T operator()(T a, T b) { return a % b; }
};

// Bitwise functors callable from host and device code. operator() is
// const-qualified so that the functors can be invoked on const objects and
// through const references.

// Returns a | b.
template<typename T>
struct bit_or : public std::binary_function<T, T, T> {
	MGPU_HOST_DEVICE T operator()(T a, T b) const { return a | b; }
};

// Returns a & b.
template<typename T>
struct bit_and : public std::binary_function<T, T, T> {
	MGPU_HOST_DEVICE T operator()(T a, T b) const { return a & b; }
};

// Returns a ^ b.
template<typename T>
struct bit_xor : public std::binary_function<T, T, T> {
	MGPU_HOST_DEVICE T operator()(T a, T b) const { return a ^ b; }
};

// Returns max(a, b) using whichever max overload is visible for T.
// operator() is const-qualified so the functor can be invoked on const
// objects and through const references.
template<typename T>
struct maximum : public std::binary_function<T, T, T> {
	MGPU_HOST_DEVICE T operator()(T a, T b) const { return max(a, b); }
};

// Returns min(a, b) using whichever min overload is visible for T.
// operator() is const-qualified for the same reason as maximum.
template<typename T>
struct minimum : public std::binary_function<T, T, T> {
	MGPU_HOST_DEVICE T operator()(T a, T b) const { return min(a, b); }
};

////////////////////////////////////////////////////////////////////////////////

// Exchanges the values of a and b through a temporary copy.
template<typename T>
MGPU_HOST_DEVICE void swap(T& a, T& b) {
	T held = b;
	b = a;
	a = held;
}

// Plain aggregate holding two values of the same type.
template<typename T>
struct DevicePair {
	T x;
	T y;
};

// Builds a DevicePair<T> whose members are x and y, in that order.
template<typename T>
MGPU_HOST_DEVICE DevicePair<T> MakeDevicePair(T x, T y) {
	DevicePair<T> result = { x, y };
	return result;
}

// Host/device replacement for the parts of std::numeric_limits used by the
// library. Only the specializations below are defined. AddIdent() and
// MulIdent() return the additive and multiplicative identities of the type.
template<typename T> struct numeric_limits;

template<> struct numeric_limits<int> {
	MGPU_HOST_DEVICE static int min() { return INT_MIN; }
	MGPU_HOST_DEVICE static int max() { return INT_MAX; }
	// For integer types lowest() coincides with min().
	MGPU_HOST_DEVICE static int lowest() { return min(); }
	MGPU_HOST_DEVICE static int AddIdent() { return 0; }
	MGPU_HOST_DEVICE static int MulIdent() { return 1; }
};

template<> struct numeric_limits<long long> {
	MGPU_HOST_DEVICE static long long min() { return LLONG_MIN; }
	MGPU_HOST_DEVICE static long long max() { return LLONG_MAX; }
	MGPU_HOST_DEVICE static long long lowest() { return min(); }
	MGPU_HOST_DEVICE static long long AddIdent() { return 0; }
	MGPU_HOST_DEVICE static long long MulIdent() { return 1; }
};

template<> struct numeric_limits<uint> {
	MGPU_HOST_DEVICE static uint min() { return 0; }
	MGPU_HOST_DEVICE static uint max() { return UINT_MAX; }
	MGPU_HOST_DEVICE static uint lowest() { return min(); }
	MGPU_HOST_DEVICE static uint AddIdent() { return 0; }
	MGPU_HOST_DEVICE static uint MulIdent() { return 1; }
};

template<> struct numeric_limits<unsigned long long> {
	MGPU_HOST_DEVICE static unsigned long long min() { return 0; }
	MGPU_HOST_DEVICE static unsigned long long max() { return ULLONG_MAX; }
	MGPU_HOST_DEVICE static unsigned long long lowest() { return min(); }
	MGPU_HOST_DEVICE static unsigned long long AddIdent() { return 0; }
	MGPU_HOST_DEVICE static unsigned long long MulIdent() { return 1; }
};

template<> struct numeric_limits<float> {
	// min() is the smallest positive normalized value (FLT_MIN); the most
	// negative finite value is lowest().
	MGPU_HOST_DEVICE static float min() { return FLT_MIN; }
	MGPU_HOST_DEVICE static float max() { return FLT_MAX; }
	MGPU_HOST_DEVICE static float lowest() { return -max(); }
	MGPU_HOST_DEVICE static float AddIdent() { return 0; }
	MGPU_HOST_DEVICE static float MulIdent() { return 1; }
};

template<> struct numeric_limits<double> {
	// min() is the smallest positive normalized value (DBL_MIN); the most
	// negative finite value is lowest().
	MGPU_HOST_DEVICE static double min() { return DBL_MIN; }
	MGPU_HOST_DEVICE static double max() { return DBL_MAX; }
	MGPU_HOST_DEVICE static double lowest() { return -max(); }
	MGPU_HOST_DEVICE static double AddIdent() { return 0; }
	MGPU_HOST_DEVICE static double MulIdent() { return 1; }
};


// Component-wise sum of two int2 values.
MGPU_HOST_DEVICE int2 operator+(int2 a, int2 b) {
	a.x += b.x;
	a.y += b.y;
	return a;
}
// Component-wise in-place sum; returns a reference to the updated operand.
MGPU_HOST_DEVICE int2& operator+=(int2& a, int2 b) {
	a.x += b.x;
	a.y += b.y;
	return a;
}
// Component-wise product of two int2 values.
MGPU_HOST_DEVICE int2 operator*(int2 a, int2 b) {
	a.x *= b.x;
	a.y *= b.y;
	return a;
}
// Component-wise in-place product; returns a reference to the updated operand.
MGPU_HOST_DEVICE int2& operator*=(int2& a, int2 b) {
	a.x *= b.x;
	a.y *= b.y;
	return a;
}

// Returns the larger of a and b. Device compilation (__CUDA_ARCH__ >= 100)
// uses a direct comparison; every other compilation pass defers to std::max.
template<typename T>
MGPU_HOST_DEVICE T max(T a, T b) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 100)
	if(a < b) return b;
	return a;
#else
	return std::max(a, b);
#endif
}
// Returns the smaller of a and b. Device compilation (__CUDA_ARCH__ >= 100)
// uses a direct comparison; every other compilation pass defers to std::min.
template<typename T>
MGPU_HOST_DEVICE T min(T a, T b) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 100)
	if(b < a) return b;
	return a;
#else
	return std::min(a, b);
#endif
}

// Component-wise maximum of two int2 values.
MGPU_HOST_DEVICE int2 max(int2 a, int2 b) {
	const int hiX = max(a.x, b.x);
	const int hiY = max(a.y, b.y);
	return make_int2(hiX, hiY);
}

// Component-wise minimum of two int2 values.
MGPU_HOST_DEVICE int2 min(int2 a, int2 b) {
	const int loX = min(a.x, b.x);
	const int loY = min(a.y, b.y);
	return make_int2(loX, loY);
}

// numeric_limits for int2: every limit is applied to both components.
template<> struct numeric_limits<int2> {
	MGPU_HOST_DEVICE static int2 min() { return make_int2(INT_MIN, INT_MIN); }
	MGPU_HOST_DEVICE static int2 max() { return make_int2(INT_MAX, INT_MAX); }
	// Same value as min() for this integer-based type.
	MGPU_HOST_DEVICE static int2 lowest() { return min(); }
	MGPU_HOST_DEVICE static int2 AddIdent() { return make_int2(0, 0); }
	MGPU_HOST_DEVICE static int2 MulIdent() { return make_int2(1, 1); }
};

// Iterator-like object that yields the same value at every position.
// Offsetting it in either direction leaves the value unchanged.
template<typename T>
class constant_iterator : public std::iterator_traits<const T*> {
public:
	MGPU_HOST_DEVICE constant_iterator(T value) : _value(value) { }

	// Indexing and dereferencing both return the stored value.
	MGPU_HOST_DEVICE T operator[](ptrdiff_t /*i*/) const { return _value; }
	MGPU_HOST_DEVICE T operator*() const { return _value; }

	// Arithmetic returns a copy; the offset is ignored.
	MGPU_HOST_DEVICE constant_iterator operator+(ptrdiff_t /*diff*/) const {
		return *this;
	}
	MGPU_HOST_DEVICE constant_iterator operator-(ptrdiff_t /*diff*/) const {
		return *this;
	}

	// Compound assignment is a no-op that returns this iterator.
	MGPU_HOST_DEVICE constant_iterator& operator+=(ptrdiff_t /*diff*/) {
		return *this;
	}
	MGPU_HOST_DEVICE constant_iterator& operator-=(ptrdiff_t /*diff*/) {
		return *this;
	}
private:
	T _value;
};

// Iterator-like object representing the sequence value, value + 1, ...
// Element i is `_value + i`. The read-only operations are const-qualified,
// matching constant_iterator, so they can be used on const iterators.
template<typename T>
class counting_iterator : public std::iterator_traits<const T*> {
public:
	MGPU_HOST_DEVICE counting_iterator(T value) : _value(value) { }

	// Returns the element i positions past the current one.
	MGPU_HOST_DEVICE T operator[](ptrdiff_t i) const {
		return _value + i;
	}
	// Returns the current element.
	MGPU_HOST_DEVICE T operator*() const {
		return _value;
	}
	// Returns a new iterator advanced by diff; this iterator is unchanged.
	MGPU_HOST_DEVICE counting_iterator operator+(ptrdiff_t diff) const {
		return counting_iterator(_value + diff);
	}
	// Returns a new iterator moved back by diff; this iterator is unchanged.
	MGPU_HOST_DEVICE counting_iterator operator-(ptrdiff_t diff) const {
		return counting_iterator(_value - diff);
	}
	// Advances this iterator by diff.
	MGPU_HOST_DEVICE counting_iterator& operator+=(ptrdiff_t diff) {
		_value += diff;
		return *this;
	}
	// Moves this iterator back by diff.
	MGPU_HOST_DEVICE counting_iterator& operator-=(ptrdiff_t diff) {
		_value -= diff;
		return *this;
	}
private:
	T _value;
};

// Iterator-like object representing the arithmetic sequence
// base, base + step, base + 2 * step, ... Element i relative to the current
// position is `_base + (_offset + i) * _step`. The read-only operations are
// const-qualified, matching constant_iterator.
template<typename T>
class step_iterator : public std::iterator_traits<const T*> {
public:
	// Members are listed in declaration order (_offset, _base, _step), which
	// is the order in which they are actually initialized.
	MGPU_HOST_DEVICE step_iterator(T base, T step) :
		_offset(0), _base(base), _step(step) { }

	// Returns the element i positions past the current one.
	MGPU_HOST_DEVICE T operator[](ptrdiff_t i) const {
		return _base + (_offset + i) * _step;
	}
	// Returns the current element.
	MGPU_HOST_DEVICE T operator*() const {
		return _base + _offset * _step;
	}
	// Returns a copy advanced by diff positions; this iterator is unchanged.
	MGPU_HOST_DEVICE step_iterator operator+(ptrdiff_t diff) const {
		step_iterator it = *this;
		it._offset += diff;
		return it;
	}
	// Returns a copy moved back by diff positions; this iterator is unchanged.
	MGPU_HOST_DEVICE step_iterator operator-(ptrdiff_t diff) const {
		step_iterator it = *this;
		it._offset -= diff;
		return it;
	}
	// Advances this iterator by diff positions.
	MGPU_HOST_DEVICE step_iterator& operator+=(ptrdiff_t diff) {
		_offset += diff;
		return *this;
	}
	// Moves this iterator back by diff positions.
	MGPU_HOST_DEVICE step_iterator& operator-=(ptrdiff_t diff) {
		_offset -= diff;
		return *this;
	}
private:
	ptrdiff_t _offset;
	T _base, _step;
};

} // namespace mgpu


// Offset-on-the-left arithmetic for counting_iterator and step_iterator.
// `it` is taken by value, so it is adjusted in place and returned.

// diff + it: the iterator advanced by diff.
template<typename T>
MGPU_HOST_DEVICE mgpu::counting_iterator<T> operator+(ptrdiff_t diff,
	mgpu::counting_iterator<T> it) {
	it += diff;
	return it;
}
// diff - it: returns the iterator moved back by diff (i.e. it - diff), not a
// distance.
template<typename T>
MGPU_HOST_DEVICE mgpu::counting_iterator<T> operator-(ptrdiff_t diff,
	mgpu::counting_iterator<T> it) {
	it += -diff;
	return it;
}
// diff + it: the iterator advanced by diff positions.
template<typename T>
MGPU_HOST_DEVICE mgpu::step_iterator<T> operator+(ptrdiff_t diff,
	mgpu::step_iterator<T> it) {
	it += diff;
	return it;
}
// diff - it: returns the iterator moved back by diff positions (i.e.
// it - diff), not a distance.
template<typename T>
MGPU_HOST_DEVICE mgpu::step_iterator<T> operator-(ptrdiff_t diff,
	mgpu::step_iterator<T> it) {
	it += -diff;
	return it;
}


================================================
FILE: 3rdparty/ctc_include/contrib/moderngpu/include/device/deviceutil.cuh
================================================
/******************************************************************************
 * Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 *
 * Code and text by Sean Baxter, NVIDIA Research
 * See http://nvlabs.github.io/moderngpu for repository and documentation.
 *
 ******************************************************************************/

#pragma once

#include "intrinsics.cuh"

namespace mgpu {

// Byte distance from a to b, i.e. (address of b) - (address of a).
MGPU_HOST_DEVICE ptrdiff_t PtrDiff(const void* a, const void* b) {
	const byte* from = (const byte*)a;
	const byte* to = (const byte*)b;
	return to - from;
}

// Returns p displaced by i bytes (not by i elements of T).
template<typename T>
MGPU_HOST_DEVICE const T* PtrOffset(const T* p, ptrdiff_t i) {
	const byte* raw = (const byte*)p;
	return (const T*)(raw + i);
}
// Mutable-pointer overload of PtrOffset.
template<typename T>
MGPU_HOST_DEVICE T* PtrOffset(T* p, ptrdiff_t i) {
	byte* raw = (byte*)p;
	return (T*)(raw + i);
}

////////////////////////////////////////////////////////////////////////////////
// Task range support
// Evenly distributes variable-length arrays over a fixed number of CTAs.

// Splits numItems across numWorkers. Returns (quotient, remainder) of the
// integer division numItems / numWorkers as an int2 (x = quotient,
// y = remainder). numWorkers must be non-zero.
MGPU_HOST int2 DivideTaskRange(int numItems, int numWorkers) {
	const int perWorker = numItems / numWorkers;
	const int leftover = numItems % numWorkers;
	return make_int2(perWorker, leftover);
}

// Returns the half-open [x, y) range of tasks assigned to `block`, given
// task = (quotient, remainder) as produced by DivideTaskRange. The first
// task.y blocks each receive one extra task.
MGPU_HOST_DEVICE int2 ComputeTaskRange(int block, int2 task) {
	const int begin = task.x * block + min(block, task.y);
	const int extra = (block < task.y);
	return make_int2(begin, begin + task.x + extra);
}

// Same as above, but scales the task range by blockSize to obtain an element
// range and clamps the end of the range to count.
MGPU_HOST_DEVICE int2 ComputeTaskRange(int block, int2 task, int blockSize,
	int count) {
	const int2 tasks = ComputeTaskRange(block, task);
	const int first = tasks.x * blockSize;
	const int last = min(count, tasks.y * blockSize);
	return make_int2(first, last);
}

////////////////////////////////////////////////////////////////////////////////
// DeviceExtractHeadFlags
// Input array flags is a bit array with 32 head flags per word.
// DeviceExtractHeadFlags returns numBits flags starting at bit index.

// Extracts numBits consecutive bits from the packed bit array `flags`
// (32 bits per word), starting at absolute bit position `index`. The result
// is returned in the low numBits bits; higher bits are cleared.
//
// flags[index / 32 + 1] is read only when the requested field straddles a
// word boundary.
MGPU_HOST_DEVICE uint DeviceExtractHeadFlags(const uint* flags, int index,
	int numBits) {

	int index2 = index >> 5;		// word containing the first bit
	int shift = 31 & index;			// bit offset inside that word
	uint headFlags = flags[index2] >> shift;
	int shifted = 32 - shift;		// bits obtained from the first word

	if(shifted < numBits)
		// We also need to shift in the next set of bits.
		headFlags = bfi(flags[index2 + 1], headFlags, shifted, shift);

	// Build the mask with an unsigned shift and skip it when all 32 bits are
	// requested: a signed (1 << numBits) - 1 overflows for numBits == 31 and
	// shifts by the full type width for numBits == 32.
	if(numBits < 32)
		headFlags &= (1u << numBits) - 1;
	return headFlags;
}

////////////////////////////////////////////////////////////////////////////////
// DevicePackHeadFlags
// Pack VT bits per thread at 32 bits/thread. Will consume an integer number of
// words, because CTA size is a multiple of 32. The first NT * VT / 32 threads
// return packed words.

// Packs the VT flag bits held by each of NT threads into dense 32-bit words.
// threadBits:   this thread's flags, in its low VT bits.
// tid:          index of the calling thread.
// flags_shared: scratch array indexed by tid; overwritten by this call.
// Returns the packed word for threads with tid < NT * VT / 32 and 0 for all
// other threads. Every thread must call this function because it contains
// __syncthreads() barriers.
template<int NT, int VT>
MGPU_DEVICE uint DevicePackHeadFlags(uint threadBits, int tid,
	uint* flags_shared) {

	// Number of 32-bit words needed to hold all NT * VT bits.
	const int WordCount = NT * VT / 32;

	// Each thread stores its thread bits to flags_shared[tid].
	flags_shared[tid] = threadBits;
	__syncthreads();

	uint packed = 0;
	if(tid < WordCount) {
		// Upper bound on how many further per-thread entries can contribute
		// to one word (MGPU_DIV_UP is presumably a ceiling division -
		// defined outside this file section).
		const int Items = MGPU_DIV_UP(32, VT);
		// Absolute position of the first bit of this thread's output word.
		int index = 32 * tid;
		// Entry that holds bit `index`.
		int first = index / VT;
		int bit = 0;

		// Offset of bit `index` inside entry `first`.
		int rem = index - VT * first;
		packed = flags_shared[first]>> rem;
		// `bit` tracks how many bits of `packed` have been filled so far.
		bit = VT - rem;
		++first;

		#pragma unroll
		for(int i = 0; i < Items; ++i) {
			// On the final iteration, skip the load once the word is full.
			if(i < Items - 1 || bit < 32) {
				uint x = flags_shared[first + i];
				if(bit < 32) packed |= x<< bit;
				bit += VT;
			}
		}
	}
	// Second barrier: all reads of flags_shared complete before returning.
	__syncthreads();

	return packed;
}

} // namespace mgpu


================================================
FILE: 3rdparty/ctc_include/contrib/moderngpu/include/device/intrinsics.cuh
================================================
/******************************************************************************
 * Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 *
 * Code and text by Sean Baxter, NVIDIA Research
 * See http://nvlabs.github.io/moderngpu for repository and documentation.
 *
 ******************************************************************************/

#include "devicetypes.cuh"

#pragma once

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"

namespace mgpu {

// Bit-level reinterpretation helpers. Each function reinterprets the storage
// of its by-value argument as the destination type via reinterpret_cast; no
// numeric conversion takes place. These casts are the reason the enclosing
// file suppresses -Wstrict-aliasing.

// Reinterprets a 64-bit unsigned integer as a uint2.
MGPU_HOST_DEVICE uint2 ulonglong_as_uint2(uint64 x) {
	return *reinterpret_cast<uint2*>(&x);
}
// Reinterprets a uint2 as a 64-bit unsigned integer.
MGPU_HOST_DEVICE uint64 uint2_as_ulonglong(uint2 x) {
	return *reinterpret_cast<uint64*>(&x);
}

// Reinterprets a 64-bit signed integer as an int2.
MGPU_HOST_DEVICE int2 longlong_as_int2(int64 x) {
	return *reinterpret_cast<int2*>(&x);
}
// Reinterprets an int2 as a 64-bit signed integer.
MGPU_HOST_DEVICE int64 int2_as_longlong(int2 x) {
	return *reinterpret_cast<int64*>(&x);
}

// Reinterprets the bits of a double as an int2.
MGPU_HOST_DEVICE int2 double_as_int2(double x) {
	return *reinterpret_cast<int2*>(&x);
}
// Reinterprets the bits of an int2 as a double.
MGPU_HOST_DEVICE double int2_as_double(int2 x) {
	return *reinterpret_cast<double*>(&x);
}

// Overwrites the first 32-bit word of d's storage with x.
// NOTE(review): that word is the low half only on little-endian targets.
MGPU_HOST_DEVICE void SetDoubleX(double& d, int x) {
	reinterpret_cast<int*>(&d)[0] = x;
}
// Returns the .x component of d's bits viewed as an int2.
MGPU_HOST_DEVICE int GetDoubleX(double d) {
	return double_as_int2(d).x;
}
// Overwrites the second 32-bit word of d's storage with y.
MGPU_HOST_DEVICE void SetDoubleY(double& d, int y) {
	reinterpret_cast<int*>(&d)[1] = y;
}
// Returns the .y component of d's bits viewed as an int2.
MGPU_HOST_DEVICE int GetDoubleY(double d) {
	return double_as_int2(d).y;
}


////////////////////////////////////////////////////////////////////////////////
// PTX for bfe and bfi

#if __CUDA_ARCH__ >= 200

// Thin device-only wrappers that emit a single PTX instruction each. They are
// only compiled when __CUDA_ARCH__ >= 200 (see the enclosing #if).

// Emits bfe.u32 with operands (x, bit, numBits) and returns its result.
MGPU_DEVICE uint bfe_ptx(uint x, uint bit, uint numBits) {
	uint result;
	asm("bfe.u32 %0, %1, %2, %3;" :
		"=r"(result) : "r"(x), "r"(bit), "r"(numBits));
	return result;
}


// Emits bfi.b32 with operands (x, y, bit, numBits) and returns its result.
MGPU_DEVICE uint bfi_ptx(uint x, uint y, uint bit, uint numBits) {
	uint result;
	asm("bfi.b32 %0, %1, %2, %3, %4;" :
		"=r"(result) : "r"(x), "r"(y), "r"(bit), "r"(numBits));
	return result;
}

// Emits prmt.b32 with operands (a, b, index) and returns its result.
MGPU_DEVICE uint prmt_ptx(uint a, uint b, uint index) {
	uint ret;
	asm("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index));
	return ret;
}

#endif // __CUDA_ARCH__ >= 200

#if CUDA_VERSION >= 9000
////////////////////////////////////////////////////////////////////////////////
// shfl_add

// CUDA >= 9 variant: issues shfl.sync.up.b32 on x with the given offset and,
// when the instruction's predicate output is set, adds the caller's own x to
// the shuffled value. When the predicate is clear the shuffled register is
// returned as-is. threadmask is passed as the instruction's member mask.
// `mask` places (WARP_SIZE - width) at bit 8 of the third shuffle operand
// (presumably the segment-mask field - see the PTX shfl documentation).
// Returns 0 when compiled for __CUDA_ARCH__ < 300, where the asm is omitted.
MGPU_DEVICE int shfl_add(int x, int offset, int width = WARP_SIZE, unsigned int threadmask = 0xFFFFFFFF) {
	int result = 0;
#if __CUDA_ARCH__ >= 300
	int mask = (WARP_SIZE - width)<< 8;
	asm(
		"{.reg .s32 r0;"
		".reg .pred p;"
		"shfl.sync.up.b32 r0|p, %1, %2, %3, %4;"
		"@p add.s32 r0, r0, %5;"
		"mov.s32 %0, r0; }"
		: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(threadmask), "r"(x));
#endif
	return result;
}

// Same as shfl_add, but combines the shuffled value with x using max.s32
// instead of add.s32.
MGPU_DEVICE int shfl_max(int x, int offset, int width = WARP_SIZE, unsigned int threadmask = 0xFFFFFFFF) {
	int result = 0;
#if __CUDA_ARCH__ >= 300
	int mask = (WARP_SIZE - width)<< 8;
	asm(
		"{.reg .s32 r0;"
		".reg .pred p;"
		"shfl.sync.up.b32 r0|p, %1, %2, %3, %4;"
		"@p max.s32 r0, r0, %5;"
		"mov.s32 %0, r0; }"
		: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(threadmask), "r"(x));
#endif
	return result;
}
#else
////////////////////////////////////////////////////////////////////////////////
// shfl_add

// Pre-CUDA-9 variant: issues shfl.up.b32 (no member mask) on x with the given
// offset and, when the instruction's predicate output is set, adds the
// caller's own x to the shuffled value. When the predicate is clear the
// shuffled register is returned as-is.
// `mask` places (WARP_SIZE - width) at bit 8 of the third shuffle operand
// (presumably the segment-mask field - see the PTX shfl documentation).
// Returns 0 when compiled for __CUDA_ARCH__ < 300, where the asm is omitted.
MGPU_DEVICE int shfl_add(int x, int offset, int width = WARP_SIZE) {
	int result = 0;
#if __CUDA_ARCH__ >= 300
	int mask = (WARP_SIZE - width)<< 8;
	asm(
		"{.reg .s32 r0;"
		".reg .pred p;"
		"shfl.up.b32 r0|p, %1, %2, %3;"
		"@p add.s32 r0, r0, %4;"
		"mov.s32 %0, r0; }"
		: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
#endif
	return result;
}

// Same as shfl_add, but combines the shuffled value with x using max.s32
// instead of add.s32.
MGPU_DEVICE int shfl_max(int x, int offset, int width = WARP_SIZE) {
	int result = 0;
#if __CUDA_ARCH__ >= 300
	int mask = (WARP_SIZE - width)<< 8;
	asm(
		"{.reg .s32 r0;"
		".reg .pred p;"
		"shfl.up.b32 r0|p, %1, %2, %3;"
		"@p max.s32 r0, r0, %4;"
		"mov.s32 %0, r0; }"
		: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
#endif
	return result;
}
#endif

////////////////////////////////////////////////////////////////////////////////
// brev, popc, clz, bfe, bfi, prmt

// Returns x with its 32 bits in reverse order (bit 0 <-> bit 31, ...).
// Uses the __brev intrinsic for __CUDA_ARCH__ >= 200, a bit loop otherwise.
MGPU_HOST_DEVICE uint brev(uint x) {
#if __CUDA_ARCH__ >= 200
	uint reversed = __brev(x);
#else
	uint reversed = 0;
	for(int src = 0, dst = 31; src < 32; ++src, --dst) {
		const uint bitValue = (x >> src) & 1;
		reversed |= bitValue << dst;
	}
#endif
	return reversed;
}

// Returns the number of set bits in x.
// Uses the __popc intrinsic for __CUDA_ARCH__ >= 200; otherwise clears the
// lowest set bit repeatedly and counts the iterations.
MGPU_HOST_DEVICE int popc(uint x) {
#if __CUDA_ARCH__ >= 200
	return __popc(x);
#else
	int setBits = 0;
	while(x) {
		x &= x - 1;
		++setBits;
	}
	return setBits;
#endif
}

// Returns the number of zero bits above the most significant set bit of x,
// or 32 when x is 0.
MGPU_HOST_DEVICE int clz(int x) {
#if __CUDA_ARCH__ >= 200
	return __clz(x);
#else
	for(int leading = 0; leading < 32; ++leading) {
		const int probe = 31 - leading;
		if((1<< probe) & x) return leading;
	}
	return 32;
#endif
}

// Returns the 1-based position of the least significant set bit of x
// (LSB is position 1), or 0 when x is 0.
MGPU_HOST_DEVICE int ffs(int x) {
#if __CUDA_ARCH__ >= 200
	return __ffs(x);
#else
	for(int position = 1; position <= 32; ++position) {
		if((1<< (position - 1)) & x) return position;
	}
	return 0;
#endif
}

// Bit-field extract: returns numBits bits of x starting at bit position
// `bit`, right-aligned in the result.
// For __CUDA_ARCH__ >= 200 this maps to the bfe.u32 PTX instruction. The
// fallback avoids shifting by 32 or more: a start position past the top bit
// yields 0, and a field of 32 or more bits keeps every bit from `bit` upward.
MGPU_HOST_DEVICE uint bfe(uint x, uint bit, uint numBits) {
#if __CUDA_ARCH__ >= 200
	return bfe_ptx(x, bit, numBits);
#else
	if(bit >= 32) return 0;
	uint field = x >> bit;
	if(numBits >= 32) return field;
	return ((1u << numBits) - 1) & field;
#endif
}

// Bit-field insert: returns y with numBits bits, starting at bit position
// `bit`, replaced by the low bits of x.
// For __CUDA_ARCH__ >= 200 this maps to the bfi.b32 PTX instruction. The
// fallback clamps the field to the top of the word and returns y unchanged
// when `bit` is past the top bit. The mask is built in 64-bit arithmetic so a
// 32-bit field does not shift by the full width of the type.
MGPU_HOST_DEVICE uint bfi(uint x, uint y, uint bit, uint numBits) {
	uint result;
#if __CUDA_ARCH__ >= 200
	result = bfi_ptx(x, y, bit, numBits);
#else
	if(bit >= 32) {
		// Nothing to insert: the field starts beyond the word.
		result = y;
	} else {
		if(numBits > 32 - bit) numBits = 32 - bit;
		uint mask = (uint)(((1ull << numBits) - 1) << bit);
		result = y & ~mask;
		result |= mask & (x << bit);
	}
#endif
	return result;
}

// Byte permute. Each of the four nibbles of `index` selects one output byte:
// selector bit 2 chooses the source word (b if set, otherwise a), the low two
// bits choose the byte within that word, and bit 3 replaces the byte with
// 0xff or 0x00 according to that byte's top bit.
// For __CUDA_ARCH__ >= 200 this maps to the prmt.b32 PTX instruction.
MGPU_HOST_DEVICE uint prmt(uint a, uint b, uint index) {
	uint result;
#if __CUDA_ARCH__ >= 200
	result = prmt_ptx(a, b, index);
#else
	result = 0;
	for(int lane = 0; lane < 4; ++lane) {
		const uint selector = 0xf & (index >> (4 * lane));
		const uint word = (selector & 4) ? b : a;
		uint chosen = 0xff & (word >> (8 * (3 & selector)));
		if(8 & selector) {
			// Replicate the byte's sign bit across the whole byte.
			chosen = (128 & chosen) ? 0xff : 0;
		}
		result |= chosen << (8 * lane);
	}
#endif
	return result;
}

// Returns floor(log2(x)) computed as 31 - clz(x). When roundUp is true and
// MGPU_IS_POW_2(x) is false, the result is incremented by one.
MGPU_HOST_DEVICE int FindLog2(int x, bool roundUp = false) {
	int log2Floor = 31 - clz(x);
	if(roundUp && !MGPU_IS_POW_2(x))
		++log2Floor;
	return log2Floor;
}

////////////////////////////////////////////////////////////////////////////////
// vset4

#if __CUDA_ARCH__ >= 300

// Performs four byte-wise comparisons and returns 1 for each byte that
// satisfies the conditional, and zero otherwise.
// Device-only wrappers around the PTX vset4 instruction; compiled only when
// __CUDA_ARCH__ >= 300 (see the enclosing #if).

// Emits vset4.u32.u32.lt.add with operands (a, b, c).
MGPU_DEVICE uint vset4_lt_add_ptx(uint a, uint b, uint c) {
	uint result;
	asm("vset4.u32.u32.lt.add %0, %1, %2, %3;" :
		"=r"(result) : "r"(a), "r"(b), "r"(c));
	return result;
}
// Emits vset4.u32.u32.eq with operands (a, b) and a constant 0 as the third
// input operand.
MGPU_DEVICE uint vset4_eq_ptx(uint a, uint b) {
	uint result;
	asm("vset4.u32.u32.eq %0, %1, %2, %3;" :
		"=r"(result) : "r"(a), "r"(b), "r"(0));
	return result;
}
#endif // __CUDA_ARCH__ >= 300

// Byte-wise "a < b" with accumulation into c.
// For __CUDA_ARCH__ >= 300 this forwards to vset4_lt_add_ptx. The fallback
// starts from c and, for each of the four byte lanes where a's byte is less
// than b's byte, adds 1 at that lane's position.
MGPU_HOST_DEVICE uint vset4_lt_add(uint a, uint b, uint c) {
	uint result;
#if __CUDA_ARCH__ >= 300
	result = vset4_lt_add_ptx(a, b, c);
#else
	result = c;
	for(int lane = 0; lane < 4; ++lane) {
		const int laneShift = 8 * lane;
		const uint laneMask = 0xffu << laneShift;
		if((laneMask & a) < (laneMask & b))
			result += 1u << laneShift;
	}
#endif
	return result;
}

// Byte-wise equality test.
// For __CUDA_ARCH__ >= 300 this forwards to vset4_eq_ptx. The fallback
// returns a word whose byte lanes hold 1 where the corresponding bytes of a
// and b are equal, and 0 elsewhere.
MGPU_HOST_DEVICE uint vset4_eq(uint a, uint b) {
	uint result;
#if __CUDA_ARCH__ >= 300
	result = vset4_eq_ptx(a, b);
#else
	result = 0;
	for(int lane = 0; lane < 4; ++lane) {
		const int laneShift = 8 * lane;
		const uint laneMask = 0xffu << laneShift;
		if((laneMask & a) == (laneMask & b))
			result += 1u << laneShift;
	}
#endif
	return result;
}

////////////////////////////////////////////////////////////////////////////////
// umulhi

// Returns the upper 32 bits of the 64-bit product x * y.
// Uses the __umulhi intrinsic for __CUDA_ARCH__ >= 100 and a 64-bit multiply
// otherwise.
MGPU_HOST_DEVICE uint umulhi(uint x, uint y) {
#if __CUDA_ARCH__ >= 100
	return __umulhi(x, y);
#else
	const uint64 wide = (uint64)x * y;
	const uint64 upper = wide >> 32;
	return (uint)upper;
#endif
}

////////////////////////////////////////////////////////////////////////////////
// ldg() function defined for all devices and all types. Only compiles to __ldg
// intrinsic for __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400 for types supported
// by __ldg in sm_32_intrinsics.h

// Trait telling whether T is registered as loadable through __ldg.
// The primary template reports false; DEFINE_LDG_TYPE(T) adds a
// specialization that reports true.
template<typename T>
struct IsLdgType {
	enum { value = false };
};
#define DEFINE_LDG_TYPE(T) \
	template<> struct IsLdgType<T> { enum { value = true }; };

// Load dispatcher. The primary template performs an ordinary dereference.
// UseLDG defaults to IsLdgType<T>::value, so a <T, true> specialization (when
// one is compiled in) is selected automatically for registered types.
template<typename T, bool UseLDG = IsLdgType<T>::value>
struct LdgShim {
	MGPU_DEVICE static T Ldg(const T* p) {
		return *p;
	}
};

// Only for 320 <= __CUDA_ARCH__ < 400: register the types below with
// IsLdgType and provide the LdgShim specialization that loads through the
// __ldg intrinsic. For every other architecture none of this is compiled and
// LdgShim falls back to a plain dereference.
#if __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400

	// List of __ldg-compatible types from sm_32_intrinsics.h.
	// Signed integer scalars and vectors.
	DEFINE_LDG_TYPE(char)
	DEFINE_LDG_TYPE(short)
	DEFINE_LDG_TYPE(int)
	DEFINE_LDG_TYPE(long long)
	DEFINE_LDG_TYPE(char2)
	DEFINE_LDG_TYPE(char4)
	DEFINE_LDG_TYPE(short2)
	DEFINE_LDG_TYPE(short4)
	DEFINE_LDG_TYPE(int2)
	DEFINE_LDG_TYPE(int4)
	DEFINE_LDG_TYPE(longlong2)

	// Unsigned integer scalars and vectors.
	DEFINE_LDG_TYPE(unsigned char)
	DEFINE_LDG_TYPE(unsigned short)
	DEFINE_LDG_TYPE(unsigned int)
	DEFINE_LDG_TYPE(unsigned long long)
	DEFINE_LDG_TYPE(uchar2)
	DEFINE_LDG_TYPE(uchar4)
	DEFINE_LDG_TYPE(ushort2)
	DEFINE_LDG_TYPE(ushort4)
	DEFINE_LDG_TYPE(uint2)
	DEFINE_LDG_TYPE(uint4)
	DEFINE_LDG_TYPE(ulonglong2)

	// Floating-point scalars and vectors.
	DEFINE_LDG_TYPE(float)
	DEFINE_LDG_TYPE(double)
	DEFINE_LDG_TYPE(float2)
	DEFINE_LDG_TYPE(float4)
	DEFINE_LDG_TYPE(double2)

	// Specialization chosen when IsLdgType<T>::value is true.
	template<typename T> struct LdgShim<T, true> {
		MGPU_DEVICE static T Ldg(const T* p) {
			return __ldg(p);
		}
	};
#endif

// Loads *p through LdgShim<T>, which selects between a plain dereference and
// the __ldg intrinsic depending on T and the target architecture.
// p must be a raw pointer to const T.
template<typename T>
MGPU_DEVICE T ldg(const T* p) {
	return LdgShim<T>::Ldg(p);
}

////////////////////////////////////////////////////////////////////////////////

// Division by a fixed denominator using a multiply-high and a shift.
// Follows the method in Hacker's Delight (2nd edition), page 228.
// Valid for denom > 1 and x < 2^31; other inputs are not supported.
struct FastDivide {
	uint denom;		// divisor supplied to the constructor
	uint coef;		// multiplier applied through umulhi
	uint shift;		// right shift applied after the multiply-high

	// Returns x / denom.
	MGPU_HOST_DEVICE uint Divide(uint x) {
		const uint high = umulhi(x, coef);
		return high >> shift;
	}
	// Returns x % denom, derived from Divide.
	MGPU_HOST_DEVICE uint Modulus(uint x) {
		const uint quotient = Divide(x);
		return x - quotient * denom;
	}

	// Precomputes coef and shift for denom_.
	explicit FastDivide(uint denom_) {
		denom = denom_;
		const uint totalShift = 31 + FindLog2(denom, true);
		// Round the reciprocal up: ceil(2^totalShift / denom).
		const unsigned long long rounded = (1ull << totalShift) + denom - 1;
		coef = (uint)(rounded / denom);
		// umulhi already accounts for 32 bits of the shift.
		shift = totalShift - 32;
	}
};

#pragma GCC diagnostic pop

} // namespace mgpu


================================================
FILE: 3rdparty/ctc_include/contrib/moderngpu/include/device/loadstore.cuh
================================================
/******************************************************************************
 * Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 *
 * Code and text by Sean Baxter, NVIDIA Research
 * See http://nvlabs.github.io/moderngpu for repository and documentation.
 *
 ******************************************************************************/

#pragma once

#include "../mgpudevice.cuh"
#include "deviceutil.cuh"
#include "intrinsics.cuh"

namespace mgpu {

////////////////////////////////////////////////////////////////////////////////
// Cooperative load functions.

// Each thread copies VT values from data into reg, reading element
// NT * i + tid into reg[i]. No bounds check is performed. When sync is true
// the function ends with a __syncthreads() barrier.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceSharedToReg(InputIt data, int tid, T* reg,
	bool sync) {

	#pragma unroll
	for(int item = 0; item < VT; ++item) {
		const int slot = NT * item + tid;
		reg[item] = data[slot];
	}

	if(sync) {
		__syncthreads();
	}
}

// Bounds-checked strided load: reg[i] receives data[NT * i + tid] only when
// that index is below count; other entries of reg are left untouched.
// When sync is true the function ends with a __syncthreads() barrier.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToRegPred(int count, InputIt data, int tid,
	T* reg, bool sync) {

	// TODO: Attempt to issue 4 loads at a time.
	#pragma unroll
	for(int item = 0; item < VT; ++item) {
		const int slot = NT * item + tid;
		if(slot < count) {
			reg[item] = data[slot];
		}
	}
	if(sync) {
		__syncthreads();
	}
}

// Strided load of up to VT values per thread into reg. When count covers a
// full NT * VT tile the loads are unconditional; otherwise the
// bounds-checked DeviceGlobalToRegPred path is used.
// When sync is true the function ends with a __syncthreads() barrier.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToReg(int count, InputIt data, int tid,
	T* reg, bool sync) {

	const bool partialTile = count < NT * VT;
	if(partialTile) {
		DeviceGlobalToRegPred<NT, VT>(count, data, tid, reg, false);
	} else {
		#pragma unroll
		for(int item = 0; item < VT; ++item)
			reg[item] = data[NT * item + tid];
	}
	if(sync) {
		__syncthreads();
	}
}
// Two-stage strided load. The first VT0 values per thread go through
// DeviceGlobalToReg (unconditional when count >= NT * VT0); values VT0 up to
// VT1 are always bounds-checked against count.
// When sync is true the function ends with a __syncthreads() barrier.
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToReg2(int count, InputIt data, int tid,
	T* reg, bool sync) {

	DeviceGlobalToReg<NT, VT0>(count, data, tid, reg, false);

	#pragma unroll
	for(int item = VT0; item < VT1; ++item) {
		const int slot = NT * item + tid;
		if(slot < count) {
			reg[item] = data[slot];
		}
	}
	if(sync) {
		__syncthreads();
	}
}

// Strided load of VT values per thread into reg, substituting init for every
// position whose index NT * i + tid is not below count. When count covers a
// full NT * VT tile the loads are unconditional.
// When sync is true the function ends with a __syncthreads() barrier.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToRegDefault(int count, InputIt data, int tid,
	T* reg, T init, bool sync) {

	const bool fullTile = count >= NT * VT;
	if(fullTile) {
		#pragma unroll
		for(int item = 0; item < VT; ++item)
			reg[item] = data[NT * item + tid];
	} else {
		#pragma unroll
		for(int item = 0; item < VT; ++item) {
			const int slot = NT * item + tid;
			if(slot < count) {
				reg[item] = data[slot];
			} else {
				reg[item] = init;
			}
		}
	}
	if(sync) {
		__syncthreads();
	}
}
// Two-stage variant of DeviceGlobalToRegDefault. The first VT0 values per
// thread go through DeviceGlobalToRegDefault; values VT0 up to VT1 are always
// bounds-checked, with init stored for out-of-range positions.
// When sync is true the function ends with a __syncthreads() barrier.
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToRegDefault2(int count, InputIt data, int tid,
	T* reg, T init, bool sync) {

	DeviceGlobalToRegDefault<NT, VT0>(count, data, tid, reg, init, false);

	#pragma unroll
	for(int item = VT0; item < VT1; ++item) {
		const int slot = NT * item + tid;
		if(slot < count) {
			reg[item] = data[slot];
		} else {
			reg[item] = init;
		}
	}
	if(sync) {
		__syncthreads();
	}
}

////////////////////////////////////////////////////////////////////////////////

// Thread-order load: thread tid reads the VT consecutive elements starting at
// data[VT * tid] into reg, using ldg() for each load. When count is smaller
// than a full NT * VT tile, only elements below count are loaded and the
// remaining entries of reg are left untouched.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToThread(int count, InputIt data, int tid,
	T* reg) {

	const bool fullTile = count >= NT * VT;
	data += VT * tid;
	if(fullTile) {
		#pragma unroll
		for(int item = 0; item < VT; ++item)
			reg[item] = ldg(data + item);
	} else {
		// Number of valid elements from this thread's starting position.
		const int available = count - VT * tid;
		#pragma unroll
		for(int item = 0; item < VT; ++item) {
			if(item < available) reg[item] = ldg(data + item);
		}
	}
}

// Thread-order load with a default: thread tid reads the VT consecutive
// elements starting at data[VT * tid] into reg, using ldg() for each load.
// When count is smaller than a full NT * VT tile, positions at or beyond
// count receive init instead.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToThreadDefault(int count, InputIt data, int tid,
	T* reg, T init) {

	const bool fullTile = count >= NT * VT;
	data += VT * tid;
	if(fullTile) {
		#pragma unroll
		for(int item = 0; item < VT; ++item)
			reg[item] = ldg(data + item);
	} else {
		// Number of valid elements from this thread's starting position.
		const int available = count - VT * tid;
		#pragma unroll
		for(int item = 0; item < VT; ++item)
			reg[item] = (item < available) ? ldg(data + item) : init;
	}
}


////////////////////////////////////////////////////////////////////////////////
// Cooperative store functions.

// Each thread writes its VT values from reg to dest, storing reg[i] at index
// NT * i + tid after converting it to dest's value type. No bounds check is
// performed. When sync is true the function ends with a __syncthreads()
// barrier.
template<int NT, int VT, typename OutputIt, typename T>
MGPU_DEVICE void DeviceRegToShared(const T* reg, int tid,
	OutputIt dest, bool sync) {

	typedef typename std::iterator_traits<OutputIt>::value_type DestType;

	#pragma unroll
	for(int item = 0; item < VT; ++item) {
		const int slot = NT * item + tid;
		dest[slot] = (DestType)reg[item];
	}

	if(sync) {
		__syncthreads();
	}
}

// Bounds-checked strided store: reg[i] is written to dest[NT * i + tid] only
// when that index is below count.
// When sync is true the function ends with a __syncthreads() barrier.
template<int NT, int VT, typename OutputIt, typename T>
MGPU_DEVICE void DeviceRegToGlobal(int count, const T* reg, int tid,
	OutputIt dest, bool sync) {

	#pragma unroll
	for(int item = 0; item < VT; ++item) {
		const int slot = NT * item + tid;
		if(slot < count) {
			dest[slot] = reg[item];
		}
	}
	if(sync) {
		__syncthreads();
	}
}

////////////////////////////////////////////////////////////////////////////////
// DeviceMemToMemLoop
// Transfer from shared memory to global, or global to shared, for transfers
// that are smaller than NT * VT in the average case. The goal is to reduce
// unnecessary comparison logic.

// Copies up to min(VT, 4) strided elements per thread from source to dest
// (element NT * i + tid for each i). All loads are issued before any store.
// When count covers a full NT * VT tile the copy is unconditional; otherwise
// each element is bounds-checked against count.
// When sync is true the function ends with a __syncthreads() barrier.
template<int NT, int VT, typename InputIt, typename OutputIt>
MGPU_DEVICE void DeviceMemToMem4(int count, InputIt source, int tid,
	OutputIt dest, bool sync) {

	typedef typename std::iterator_traits<InputIt>::value_type ValueType;

	// At most four elements are moved per thread, regardless of VT.
	const int PerThread = (VT < 4) ? VT : 4;
	const bool fullTile = count >= NT * VT;

	ValueType staged[VT];
	if(fullTile) {
		#pragma unroll
		for(int item = 0; item < PerThread; ++item)
			staged[item] = source[NT * item + tid];
		#pragma unroll
		for(int item = 0; item < PerThread; ++item)
			dest[NT * item + tid] = staged[item];
	} else {
		#pragma unroll
		for(int item = 0; item < PerThread; ++item) {
			const int slot = NT * item + tid;
			if(slot < count) {
				staged[item] = source[slot];
			}
		}
		#pragma unroll
		for(int item = 0; item < PerThread; ++item) {
			const int slot = NT * item + tid;
			if(slot < count) {
				dest[slot] = staged[item];
			}
		}
	}
	if(sync) {
		__syncthreads();
	}
}
// Copies count elements from source to dest in chunks of 4 * NT, delegating
// each chunk to DeviceMemToMem4<NT, 4> with the remaining element count.
// When sync is true the function ends with a __syncthreads() barrier.
template<int NT, typename InputIt, typename OutputIt>
MGPU_DEVICE void DeviceMemToMemLoop(int count, InputIt source, int tid,
	OutputIt dest, bool sync) {

	const int Chunk = 4 * NT;
	for(int done = 0; done < count; done += Chunk) {
		const int remaining = count - done;
		DeviceMemToMem4<NT, 4>(remaining, source + done, tid, dest + done,
			false);
	}
	if(sync) {
		__syncthreads();
	}
}


////////////////////////////////////////////////////////////////////////////////
// Functions to copy between shared and global memory where the average case is
// to transfer NT * VT elements.

// Copy up to NT * VT elements from source to dest in strided order: the
// thread handles elements NT * k + tid for k in [0, VT), skipping those at or
// past count. Each value is cast to the destination iterator's value type.
// Calls __syncthreads() at the end when sync is set.
template<int NT, int VT, typename T, typename OutputIt>
MGPU_DEVICE void DeviceSharedToGlobal(int count, const T* source, int tid,
	OutputIt dest, bool sync) {

	typedef typename std::iterator_traits<OutputIt>::value_type DestType;

	#pragma unroll
	for(int k = 0; k < VT; ++k) {
		const int slot = NT * k + tid;
		if(slot < count) {
			dest[slot] = (DestType)source[slot];
		}
	}

	if(sync) __syncthreads();
}

// Two-step copy from source to dest: DeviceGlobalToReg fills a per-thread
// staging array (bounded by count, no barrier), then DeviceRegToShared writes
// the staging array to dest and forwards the caller's sync flag.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToShared(int count, InputIt source, int tid,
	T* dest, bool sync) {

	T staged[VT];

	// Load phase: never synchronizes.
	DeviceGlobalToReg<NT, VT>(count, source, tid, staged, false);

	// Store phase: synchronization is decided by the caller.
	DeviceRegToShared<NT, VT>(staged, tid, dest, sync);
}

// Variant of DeviceGlobalToShared with two grain sizes. The load goes through
// DeviceGlobalToReg2<NT, VT0, VT1> into a VT1-element staging array, which is
// then written to dest with DeviceRegToShared<NT, VT1>. Only the store step
// receives the caller's sync flag.
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToShared2(int count, InputIt source, int tid,
	T* dest, bool sync) {

	T staged[VT1];

	DeviceGlobalToReg2<NT, VT0, VT1>(count, source, tid, staged, false);
	DeviceRegToShared<NT, VT1>(staged, tid, dest, sync);
}


// Like DeviceGlobalToShared, but the load is done by DeviceGlobalToRegDefault,
// which additionally receives init. The VT-element staging array is then
// written to dest with DeviceRegToShared, forwarding the caller's sync flag.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToSharedDefault(int count, InputIt source, int tid,
	T* dest, T init, bool sync) {

	T staged[VT];

	DeviceGlobalToRegDefault<NT, VT>(count, source, tid, staged, init, false);
	DeviceRegToShared<NT, VT>(staged, tid, dest, sync);
}

// Two-grain-size variant of DeviceGlobalToSharedDefault: loads through
// DeviceGlobalToRegDefault2<NT, VT0, VT1> (passing init) into a VT1-element
// staging array and writes it to dest with DeviceRegToShared<NT, VT1>,
// forwarding the caller's sync flag to the store step only.
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToSharedDefault2(int count, InputIt data, int tid,
	T* dest, T init, bool sync) {

	T staged[VT1];

	DeviceGlobalToRegDefault2<NT, VT0, VT1>(count, data, tid, staged, init,
		false);
	DeviceRegToShared<NT, VT1>(staged, tid, dest, sync);
}


////////////////////////////////////////////////////////////////////////////////

// Copy from source to dest in two pieces. The first piece always runs and
// moves min(VT, 3) values per thread; the second piece moves the remaining
// VT - min(VT, 3) values per thread and only runs when count reaches past the
// first piece. Neither piece synchronizes; one __syncthreads() is issued at
// the end when sync is set.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToSharedLoop(int count, InputIt source, int tid,
	T* dest, bool sync) {

	const int HeadVT = MGPU_MIN(VT, 3);
	const int TailVT = VT - HeadVT;

	// Leading piece: the first HeadVT * NT elements.
	DeviceGlobalToShared<NT, HeadVT>(count, source, tid, dest, false);

	// Trailing piece: everything after the leading piece, if any.
	const int headCount = HeadVT * NT;
	if(count > headCount) {
		DeviceGlobalToShared<NT, TailVT>(count - headCount,
			source + headCount, tid, dest + headCount, false);
	}

	if(sync) __syncthreads();

	/*
	source += tid;
	while(count > 0) {
		T reg[Granularity];
		#pragma unroll
		for(int i = 0; i < Granularity; ++i) {
			int index = NT * i + tid;
			if(index < count)
				reg[i] = source[NT * i];
		}
		DeviceRegToShared<NT, Granularity>(reg, tid, dest, false);
		source += Granularity * NT;
		dest += Granularity * NT;
		count -= Granularity * NT;
	}
	if(sync) __syncthreads();*/
}

// Copy up to NT * VT elements from source to dest through a per-thread
// staging array typed as the output iterator's value type. The load
// (DeviceGlobalToReg) never synchronizes; the store (DeviceRegToGlobal)
// receives the caller's sync flag.
template<int NT, int VT, typename InputIt, typename OutputIt>
MGPU_DEVICE void DeviceGlobalToGlobal(int count, InputIt source, int tid,
	OutputIt dest, bool sync) {

	typedef typename std::iterator_traits<OutputIt>::value_type ValueType;

	ValueType staged[VT];
	DeviceGlobalToReg<NT, VT>(count, source, tid, staged, false);
	DeviceRegToGlobal<NT, VT>(count, staged, tid, dest, sync);
}

////////////////////////////////////////////////////////////////////////////////
// Transpose VT elements in NT threads (x) into thread-order registers (y)
// using only NT * VT / 2 elements of shared memory.
//
// x holds the tile in strided order (x[i] is tile element NT * i + tid) and y
// receives it in thread order (y[i] is tile element VT * tid + i). The tile is
// pushed through shared memory in two passes of NT * VT / 2 elements each:
// the lower half is consumed by threads tid < NT / 2, the upper half by
// threads tid >= NT / 2.
//
// Both passes decide what to store by comparing the element index against
// NT * VT / 2. For odd VT this selects exactly the same elements as the
// previous row/thread test. For even VT the previous test wrote row VT / 2
// past the end of the half-size buffer in the first pass and never stored
// x[VT / 2] for tid < NT / 2 in the second pass, so the upper threads read
// stale data.
//
// NOTE(review): assumes NT is even so that the two halves split on a thread
// boundary - confirm for all callers. The runtime warning below is kept until
// this routine has been validated on hardware. TODO(erich)
template<int NT, int VT, typename T>
MGPU_DEVICE void HalfSmemTranspose(const T* x, int tid, T* shared, T* y) {
	printf("HalfSmemTranspose has a bug, use WAR SmemTranpose or find bug before using in production");

	// Number of tile elements that fit in shared memory at once.
	const int Half = NT * VT / 2;

	// Pass 1: stage the lower half of the tile (element index < Half).
	// Rows at or above (VT + 1) / 2 lie entirely in the upper half.
	#pragma unroll
	for(int i = 0; i < (VT + 1) / 2; ++i) {
		int index = NT * i + tid;
		if(index < Half)
			shared[index] = x[i];
	}
	__syncthreads();

	if(tid < NT / 2) {
		#pragma unroll
		for(int i = 0; i < VT; ++i)
			y[i] = shared[VT * tid + i];
	}
	// Wait for the readers before the buffer is overwritten.
	__syncthreads();

	// Pass 2: stage the upper half of the tile (element index >= Half),
	// rebased to the start of the shared buffer. Rows below VT / 2 lie
	// entirely in the lower half.
	#pragma unroll
	for(int i = VT / 2; i < VT; ++i) {
		int index = NT * i + tid;
		if(index >= Half)
			shared[index - Half] = x[i];
	}
	__syncthreads();

	if(tid >= NT / 2) {
		#pragma unroll
		for(int i = 0; i < VT; ++i)
			y[i] = shared[VT * tid + i - Half];
	}
	__syncthreads();
}

////////////////////////////////////////////////////////////////////////////////
// Gather/scatter functions

// Gather VT values per thread: reg[k] = data[indices[k]]. When count is below
// a full NT * VT tile, register k is only loaded if its strided position
// NT * k + tid is below count; other registers are left unmodified.
// Calls __syncthreads() at the end when sync is set.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGather(int count, InputIt data, int indices[VT],
	int tid, T* reg, bool sync) {

	if(count < NT * VT) {
		// Partial tile: bounds-check each strided position.
		#pragma unroll
		for(int k = 0; k < VT; ++k) {
			const int slot = NT * k + tid;
			if(slot < count) {
				reg[k] = data[indices[k]];
			}
		}
	} else {
		// Full tile: every register is loaded.
		#pragma unroll
		for(int k = 0; k < VT; ++k) {
			reg[k] = data[indices[k]];
		}
	}

	if(sync) __syncthreads();
}

// Gather VT values per thread like DeviceGather, but in a partial tile every
// register whose strided position NT * k + tid is at or past count is set to
// identity instead of being left unmodified.
// Calls __syncthreads() at the end when sync is set.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGatherDefault(int count, InputIt data, int indices[VT],
	int tid, T* reg, T identity, bool sync) {

	if(count < NT * VT) {
		// Partial tile: out-of-range registers receive identity.
		#pragma unroll
		for(int k = 0; k < VT; ++k) {
			const int slot = NT * k + tid;
			reg[k] = (slot < count) ? data[indices[k]] : identity;
		}
	} else {
		// Full tile: every register is loaded.
		#pragma unroll
		for(int k = 0; k < VT; ++k) {
			reg[k] = data[indices[k]];
		}
	}

	if(sync) __syncthreads();
}

// Scatter VT values per thread: data[indices[k]] = reg[k]. When count is below
// a full NT * VT tile, register k is only stored if its strided position
// NT * k + tid is below count.
// Calls __syncthreads() at the end when sync is set.
template<int NT, int VT, typename T, typename OutputIt>
MGPU_DEVICE void DeviceScatter(int count, const T* reg, int tid,
	int indices[VT], OutputIt data, bool sync) {

	if(count < NT * VT) {
		// Partial tile: bounds-check each strided position.
		#pragma unroll
		for(int k = 0; k < VT; ++k) {
			const int slot = NT * k + tid;
			if(slot < count) {
				data[indices[k]] = reg[k];
			}
		}
	} else {
		// Full tile: every register is stored.
		#pragma unroll
		for(int k = 0; k < VT; ++k) {
			data[indices[k]] = reg[k];
		}
	}

	if(sync) __syncthreads();
}

////////////////////////////////////////////////////////////////////////////////
// Cooperative transpose functions (strided to thread order)

// Store this thread's VT registers contiguously at shared[VT * tid ...].
// With an even VT the values are written two at a time as DevicePair<T>
// (to exploit the 8-byte shared memory mode on Kepler); with an odd VT they
// are written one by one as T.
// Calls __syncthreads() at the end when sync is set.
template<int VT, typename T>
MGPU_DEVICE void DeviceThreadToShared(const T* threadReg, int tid, T* shared,
	bool sync) {

	T* base = shared + VT * tid;

	if(0 == (1 & VT)) {
		// Even grain size: paired stores.
		DevicePair<T>* pairs = (DevicePair<T>*)base;
		#pragma unroll
		for(int k = 0; k < VT / 2; ++k) {
			pairs[k] = MakeDevicePair(threadReg[2 * k], threadReg[2 * k + 1]);
		}
	} else {
		// Odd grain size: element-wise stores.
		#pragma unroll
		for(int k = 0; k < VT; ++k) {
			base[k] = threadReg[k];
		}
	}

	if(sync) __syncthreads();
}

// Load this thread's VT contiguous values from shared[VT * tid ...] into
// threadReg. With an even VT the values are read two at a time as
// DevicePair<T>; with an odd VT they are read one by one as T.
// Calls __syncthreads() at the end when sync is set.
template<int VT, typename T>
MGPU_DEVICE void DeviceSharedToThread(const T* shared, int tid, T* threadReg,
	bool sync) {

	const T* base = shared + VT * tid;

	if(0 == (1 & VT)) {
		// Even grain size: paired loads, then unpack .x and .y.
		const DevicePair<T>* pairs = (const DevicePair<T>*)base;
		#pragma unroll
		for(int k = 0; k < VT / 2; ++k) {
			DevicePair<T> both = pairs[k];
			threadReg[2 * k] = both.x;
			threadReg[2 * k + 1] = both.y;
		}
	} else {
		// Odd grain size: element-wise loads.
		#pragma unroll
		for(int k = 0; k < VT; ++k) {
			threadReg[k] = base[k];
		}
	}

	if(sync) __syncthreads();
}

////////////////////////////////////////////////////////////////////////////////
// DeviceLoad2 - load from pointers of the same type. Optimize for a single LD
// statement.

// Load the concatenation of two arrays of the same pointer type into strided
// registers using a single load per element.
//
// Logical position index = NT * i + tid refers to a_global[index] when
// index < aCount and to b_global[index - aCount] otherwise. The B case is
// expressed as an offset (b0) from a_global so that both cases share one load
// expression.
//
// Registers [0, VT0) skip the bounds check when aCount + bCount covers at
// least NT * VT0 elements; registers [VT0, VT1) are always bounds-checked.
// Registers whose position is at or past aCount + bCount are left unmodified.
//
// sync: when true, __syncthreads() is called after the loads. (Previously the
// flag was accepted but ignored, unlike the other loaders in this file.)
template<int NT, int VT0, int VT1, typename T>
MGPU_DEVICE void DeviceLoad2ToReg(const T* a_global, int aCount,
	const T* b_global, int bCount, int tid, T* reg, bool sync) {

	// Offset that rebases a position in the B range onto a_global. The
	// pointer difference is narrowed to int explicitly.
	int b0 = (int)(b_global - a_global - aCount);
	int total = aCount + bCount;
	if(total >= NT * VT0) {
		// The first VT0 registers are all in range.
		#pragma unroll
		for(int i = 0; i < VT0; ++i) {
			int index = NT * i + tid;
			reg[i] = a_global[index + ((index >= aCount) ? b0 : 0)];
		}
	} else {
		#pragma unroll
		for(int i = 0; i < VT0; ++i) {
			int index = NT * i + tid;
			if(index < total)
				reg[i] = a_global[index + ((index >= aCount) ? b0 : 0)];
		}
	}
	// Remaining registers are always range-checked.
	#pragma unroll
	for(int i = VT0; i < VT1; ++i) {
		int index = NT * i + tid;
		if(index < total)
			reg[i] = a_global[index + ((index >= aCount) ? b0 : 0)];
	}
	if(sync) __syncthreads();
}

// Load the concatenation of two same-typed arrays into shared memory:
// DeviceLoad2ToReg fills a VT1-element staging array (passing sync = false),
// then DeviceRegToShared writes it to shared, forwarding the caller's sync
// flag.
template<int NT, int VT0, int VT1, typename T>
MGPU_DEVICE void DeviceLoad2ToShared(const T* a_global, int aCount,
	const T* b_global, int bCount, int tid, T* shared, bool sync) {

	T staged[VT1];

	DeviceLoad2ToReg<NT, VT0, VT1>(a_global, aCount, b_global, bCount, tid,
		staged, false);
	DeviceRegToShared<NT, VT1>(staged, tid, shared, sync);
}

////////////////////////////////////////////////////////////////////////////////
// DeviceLoad2 - load from pointers of different types. Uses two LD statements.

// Load the concatenation of two arrays with (possibly) different iterator
// types into strided registers, using one load statement per source.
//
// Logical position index = NT * i + tid refers to a_global[index] when
// index < aCount and to element index - aCount of B otherwise; b_global is
// rebased by -aCount up front so B can be indexed with the same position.
//
// Registers [0, VT0) skip the upper bounds check when aCount + bCount covers
// at least NT * VT0 elements; registers [VT0, VT1) are always bounds-checked.
// Registers whose position is at or past aCount + bCount are left unmodified.
//
// sync: when true, __syncthreads() is called after the loads. (Previously the
// flag was accepted but ignored, unlike the other loaders in this file.)
template<int NT, int VT0, int VT1, typename InputIt1, typename InputIt2,
	typename T>
MGPU_DEVICE void DeviceLoad2ToReg(InputIt1 a_global, int aCount,
	InputIt2 b_global, int bCount, int tid, T* reg, bool sync)  {

	// Rebase B so that b_global[index] is valid for index >= aCount.
	b_global -= aCount;
	int total = aCount + bCount;
	if(total >= NT * VT0) {
		// The first VT0 registers are all in range.
		#pragma unroll
		for(int i = 0; i < VT0; ++i) {
			int index = NT * i + tid;
			if(index < aCount) reg[i] = a_global[index];
			else reg[i] = b_global[index];
		}
	} else {
		#pragma unroll
		for(int i = 0; i < VT0; ++i) {
			int index = NT * i + tid;
			if(index < aCount) reg[i] = a_global[index];
			else if(index < total) reg[i] = b_global[index];
		}
	}
	// Remaining registers are always range-checked.
	#pragma unroll
	for(int i = VT0; i < VT1; ++i) {
		int index = NT * i + tid;
		if(index < aCount) reg[i] = a_global[index];
		else if(index < total) reg[i] = b_global[index];
	}
	if(sync) __syncthreads();
}

// Load the concatenation of two arrays with different iterator types into
// shared memory: DeviceLoad2ToReg fills a VT1-element staging array (passing
// sync = false), then DeviceRegToShared writes it to shared, forwarding the
// caller's sync flag.
template<int NT, int VT0, int VT1, typename InputIt1, typename InputIt2,
	typename T>
MGPU_DEVICE void DeviceLoad2ToShared(InputIt1 a_global, int aCount,
	InputIt2 b_global, int bCount, int tid, T* shared, bool sync) {

	T staged[VT1];

	DeviceLoad2ToReg<NT, VT0, VT1>(a_global, aCount, b_global, bCount, tid,
		staged, false);
	DeviceRegToShared<NT, VT1>(staged, tid, shared, sync);
}


////////////////////////////////////////////////////////////////////////////////
// DeviceGatherGlobalToGlobal

// For each strided position p = NT * k + tid below count, read the gather
// index indices_shared[p], fetch data_global[that index], and finally write
// the fetched value to dest_global[p].
// When sync is set, __syncthreads() is called after the gather indices have
// been read and before the results are stored; the store itself does not
// synchronize.
template<int NT, int VT, typename InputIt, typename OutputIt>
MGPU_DEVICE void DeviceGatherGlobalToGlobal(int count, InputIt data_global,
	const int* indices_shared, int tid, OutputIt dest_global, bool sync) {

	typedef typename std::iterator_traits<InputIt>::value_type ValueType;
	ValueType gathered[VT];

	#pragma unroll
	for(int k = 0; k < VT; ++k) {
		const int slot = NT * k + tid;
		if(slot < count) {
			const int from = indices_shared[slot];
			gathered[k] = data_global[from];
		}
	}

	if(sync) __syncthreads();

	DeviceRegToGlobal<NT, VT>(count, gathered, tid, dest_global, false);
}

////////////////////////////////////////////////////////////////////////////////
// DeviceTransferMergeValues
// Gather in a merge-like value from two input arrays and store to a single
// output. Like DeviceGatherGlobalToGlobal, but for two arrays at once.

// Gather VT values per thread from two sources with (possibly) different
// iterator types. A gather index below bStart selects a_global[index]; any
// other index selects B, which is rebased by -bStart up front so it can be
// addressed with the same index.
// In a partial tile (count < NT * VT) register k is only loaded when its
// strided position NT * k + tid is below count.
// Calls __syncthreads() at the end when sync is set.
template<int NT, int VT, typename InputIt1, typename InputIt2,
	typename T>
MGPU_DEVICE void DeviceTransferMergeValuesReg(int count, InputIt1 a_global,
	InputIt2 b_global, int bStart, const int* indices, int tid,
	T* reg, bool sync) {

	// Rebase B so b_global[index] is valid for index >= bStart.
	b_global -= bStart;

	if(count < NT * VT) {
		// Partial tile: bounds-check each strided position.
		#pragma unroll
		for(int k = 0; k < VT; ++k) {
			const int slot = NT * k + tid;
			if(slot < count) {
				reg[k] = (indices[k] < bStart) ? a_global[indices[k]] :
					b_global[indices[k]];
			}
		}
	} else {
		// Full tile: every register is loaded.
		#pragma unroll
		for(int k = 0; k < VT; ++k) {
			reg[k] = (indices[k] < bStart) ? a_global[indices[k]] :
				b_global[indices[k]];
		}
	}

	if(sync) __syncthreads();
}

// Merge-style gather from two sources into dest_global, with the gather
// indices supplied through indices_shared:
//   1. DeviceSharedToReg copies the indices into a per-thread array.
//   2. DeviceTransferMergeValuesReg gathers the values into a staging array
//      typed as InputIt1's value type.
//   3. DeviceRegToGlobal writes the staging array to dest_global.
// The caller's sync flag is forwarded to steps 2 and 3.
template<int NT, int VT, typename InputIt1, typename InputIt2,
	typename OutputIt>
MGPU_DEVICE void DeviceTransferMergeValuesShared(int count, InputIt1 a_global,
	InputIt2 b_global, int bStart, const int* indices_shared, int tid,
	OutputIt dest_global, bool sync) {

	typedef typename std::iterator_traits<InputIt1>::value_type ValueType;

	int gatherIdx[VT];
	DeviceSharedToReg<NT, VT>(indices_shared, tid, gatherIdx);

	ValueType staged[VT];
	DeviceTransferMergeValuesReg<NT, VT>(count, a_global, b_global, bStart,
		gatherIdx, tid, staged, sync);

	DeviceRegToGlobal<NT, VT>(count, staged, tid, dest_global, sync);
}

// Gather VT values per thread from two sources of the same pointer type using
// a single load per element. A gather index below bStart addresses a_global
// directly; any other index is shifted by bOffset so that the same load from
// a_global lands in B.
// In a partial tile (count < NT * VT) register k is only loaded when its
// strided position NT * k + tid is below count.
// Calls __syncthreads() at the end when sync is set.
template<int NT, int VT, typename T>
MGPU_DEVICE void DeviceTransferMergeValuesReg(int count, const T* a_global,
	const T* b_global, int bStart, const int* indices, int tid, T* reg,
	bool sync) {

	// Distance from a_global that rebases an index in the B range onto B.
	const int bOffset = (int)(b_global - a_global - bStart);

	if(count < NT * VT) {
		// Partial tile: the address is computed for every register but the
		// load is bounds-checked.
		#pragma unroll
		for(int k = 0; k < VT; ++k) {
			const int slot = NT * k + tid;
			int from = indices[k];
			if(from >= bStart) from += bOffset;
			if(slot < count) {
				reg[k] = a_global[from];
			}
		}
	} else {
		// Full tile: every register is loaded.
		#pragma unroll
		for(int k = 0; k < VT; ++k) {
			int from = indices[k];
			if(from >= bStart) from += bOffset;
			reg[k] = a_global[from];
		}
	}

	if(sync) __syncthreads();
}

// Same-pointer-type variant of the merge-style gather into dest_global:
//   1. DeviceSharedToReg copies the gather indices from indices_shared into a
//      per-thread array.
//   2. DeviceTransferMergeValuesReg gathers the values into a staging array
//      of T.
//   3. DeviceRegToGlobal writes the staging array to dest_global.
// The caller's sync flag is forwarded to steps 2 and 3.
template<int NT, int VT, typename T, typename OutputIt>
MGPU_DEVICE void DeviceTransferMergeValuesShared(int count, const T* a_global,
	const T* b_global, int bStart, const int* indices_shared, int tid,
	OutputIt dest_global, bool sync) {

	int gatherIdx[VT];
	DeviceSharedToReg<NT, VT>(indices_shared, tid, gatherIdx);

	T staged[VT];
	DeviceTransferMergeValuesReg<NT, VT>(count, a_global, b_global, bStart,
		gatherIdx, tid, staged, sync);

	DeviceRegToGlobal<NT, VT>(count, staged, tid, dest_global, sync);
}

} // namespace mgpu


================================================
FILE: 3rdparty/ctc_include/contrib/moderngpu/include/device/serialsets.cuh
================================================
/******************************************************************************
 * Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 *
 * Code and text by Sean Baxter, NVIDIA Research
 * See http://nvlabs.github.io/moderngpu for repository and documentation.
 *
 ******************************************************************************/

#pragma once

#include "deviceutil.cuh"

namespace mgpu {

////////////////////////////////////////////////////////////////////////////////
// SerialSetIntersection
// Emit A if A and B are in range and equal.

// Run up to VT serial steps of a set intersection over data, with cursor
// aBegin walking A and cursor bBegin walking B.
//
// Each executed step i records the current A key in results[i] and its
// position in indices[i], advances A unless B's key orders first, advances B
// unless A's key orders first, and sets bit i of the return value when the
// two keys are equivalent under comp (neither orders before the other).
//
// A step executes when:
//   RangeCheck:  aBegin + bBegin < end and both cursors are inside their
//                ranges;
//   otherwise:   i < VT / 2, or aBegin + bBegin < end.
template<int VT, bool RangeCheck, typename T, typename Comp>
MGPU_DEVICE int SerialSetIntersection(const T* data, int aBegin, int aEnd,
	int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) {

	const int GuaranteedSteps = VT / 2;
	int commitMask = 0;

	#pragma unroll
	for(int i = 0; i < VT; ++i) {
		bool active;
		if(RangeCheck)
			active = (aBegin + bBegin < end) && (aBegin < aEnd) &&
				(bBegin < bEnd);
		else
			active = (i < GuaranteedSteps) || (aBegin + bBegin < end);

		if(active) {
			T keyA = data[aBegin];
			T keyB = data[bBegin];

			bool aFirst = comp(keyA, keyB);
			bool bFirst = comp(keyB, keyA);

			// The outputs must come from A by definition of set intersection.
			results[i] = keyA;
			indices[i] = aBegin;

			if(!bFirst) ++aBegin;
			if(!aFirst) ++bBegin;

			// Commit only when the keys match.
			if(aFirst == bFirst) commitMask |= 1<< i;
		}
	}
	return commitMask;
}

////////////////////////////////////////////////////////////////////////////////
// SerialSetUnion
// Emit A if A <= B. Emit B if B < A.

// Run up to VT serial steps of a set union over data, with cursor aBegin
// walking A and cursor bBegin walking B.
//
// Each executed step i emits B's key when it orders strictly before A's key
// and A's key otherwise (ties go to A), stores the emitted key in results[i]
// and its position in indices[i], and always sets bit i of the return value.
// On a tie both cursors advance. With RangeCheck, an exhausted A forces B to
// be emitted and an exhausted B forces A to be emitted.
//
// A step executes when aBegin + bBegin < end; without RangeCheck the first
// VT / 2 steps execute unconditionally.
template<int VT, bool RangeCheck, typename T, typename Comp>
MGPU_DEVICE int SerialSetUnion(const T* data, int aBegin, int aEnd,
	int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) {

	const int GuaranteedSteps = VT / 2;
	int commitMask = 0;

	#pragma unroll
	for(int i = 0; i < VT; ++i) {
		bool active;
		if(RangeCheck)
			active = (aBegin + bBegin < end);
		else
			active = (i < GuaranteedSteps) || (aBegin + bBegin < end);

		if(active) {
			T keyA = data[aBegin];
			T keyB = data[bBegin];

			bool aFirst = false, bFirst = false;
			if(RangeCheck && aBegin >= aEnd) {
				// A is exhausted.
				bFirst = true;
			} else if(RangeCheck && bBegin >= bEnd) {
				// B is exhausted.
				aFirst = true;
			} else {
				// Both are in range.
				aFirst = comp(keyA, keyB);
				bFirst = comp(keyB, keyA);
			}

			// Output A in case of a tie, so check if b < a.
			if(bFirst) {
				results[i] = keyB;
				indices[i] = bBegin;
			} else {
				results[i] = keyA;
				indices[i] = aBegin;
			}

			if(!bFirst) ++aBegin;
			if(!aFirst) ++bBegin;
			commitMask |= 1<< i;
		}
	}
	return commitMask;
}

////////////////////////////////////////////////////////////////////////////////
// SerialSetDifference
// Emit A if A < B.

// Run up to VT serial steps of a set difference (A minus B) over data, with
// cursor aBegin walking A and cursor bBegin walking B.
//
// Each executed step i records the current A key in results[i] and its
// position in indices[i], advances A unless B's key orders first, advances B
// unless A's key orders first, and sets bit i of the return value only when
// A's key orders strictly before B's key. With RangeCheck, an exhausted A is
// treated as "B first" and an exhausted B as "A first".
//
// A step executes when aBegin + bBegin < end; without RangeCheck the first
// VT / 2 steps execute unconditionally.
template<int VT, bool RangeCheck, typename T, typename Comp>
MGPU_DEVICE int SerialSetDifference(const T* data, int aBegin, int aEnd,
	int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) {

	const int GuaranteedSteps = VT / 2;
	int commitMask = 0;

	#pragma unroll
	for(int i = 0; i < VT; ++i) {
		bool active;
		if(RangeCheck)
			active = (aBegin + bBegin < end);
		else
			active = (i < GuaranteedSteps) || (aBegin + bBegin < end);

		if(active) {
			T keyA = data[aBegin];
			T keyB = data[bBegin];

			bool aFirst = false, bFirst = false;
			if(RangeCheck && aBegin >= aEnd) {
				// A is exhausted.
				bFirst = true;
			} else if(RangeCheck && bBegin >= bEnd) {
				// B is exhausted.
				aFirst = true;
			} else {
				aFirst = comp(keyA, keyB);
				bFirst = comp(keyB, keyA);
			}

			// The outputs must come from A by definition of set difference.
			results[i] = keyA;
			indices[i] = aBegin;

			if(!bFirst) ++aBegin;
			if(!aFirst) ++bBegin;

			// Commit only keys of A that order before the current B key.
			if(aFirst) commitMask |= 1<< i;
		}
	}
	return commitMask;
}

////////////////////////////////////////////////////////////////////////////////
// SerialSetSymDiff
// Emit A if A < B and emit B if B < A.

// Run up to VT serial steps of a symmetric set difference over data, with
// cursor aBegin walking A and cursor bBegin walking B.
//
// Each executed step i emits A's key when it orders strictly before B's key
// and B's key otherwise, stores the emitted key in results[i] and its
// position in indices[i], and sets bit i of the return value only when
// exactly one key orders before the other (the keys differ). On a tie both
// cursors advance. With RangeCheck, an exhausted B is checked first and
// treated as "A first"; an exhausted A is treated as "B first".
//
// A step executes when aBegin + bBegin < end; without RangeCheck the first
// VT / 2 steps execute unconditionally.
template<int VT, bool RangeCheck, typename T, typename Comp>
MGPU_DEVICE int SerialSetSymDiff(const T* data, int aBegin, int aEnd,
	int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) {

	const int GuaranteedSteps = VT / 2;
	int commitMask = 0;

	#pragma unroll
	for(int i = 0; i < VT; ++i) {
		bool active;
		if(RangeCheck)
			active = (aBegin + bBegin < end);
		else
			active = (i < GuaranteedSteps) || (aBegin + bBegin < end);

		if(active) {
			T keyA = data[aBegin];
			T keyB = data[bBegin];

			bool aFirst = false, bFirst = false;
			if(RangeCheck && (bBegin >= bEnd)) {
				// B is exhausted.
				aFirst = true;
			} else if(RangeCheck && (aBegin >= aEnd)) {
				// A is exhausted.
				bFirst = true;
			} else {
				aFirst = comp(keyA, keyB);
				bFirst = comp(keyB, keyA);
			}

			if(aFirst) {
				results[i] = keyA;
				indices[i] = aBegin;
			} else {
				results[i] = keyB;
				indices[i] = bBegin;
			}

			if(!aFirst) ++bBegin;
			if(!bFirst) ++aBegin;

			// Commit only keys present on one side.
			if(aFirst != bFirst) commitMask |= 1<< i;
		}
	}
	return commitMask;
}

////////////////////////////////////////////////////////////////////////////////
// SerialSetOp
// Uses the MgpuSetOp enum to statically select one of the four serial ops
// above.

// Statically dispatch to one of the four serial set operations selected by the
// Op template argument and return its commit bit mask.
//
// The stopping point handed to the serial routine is
//   end = aBegin + bBegin + VT - star
// and, when RangeCheck is set, it is additionally clamped to aEnd + bEnd.
// __syncthreads() is called after the serial operation completes.
//
// Returns the commit mask produced by the selected operation, or 0 if Op does
// not name one of the four handled operations (the mask was previously left
// uninitialized in that case).
template<int VT, bool RangeCheck, MgpuSetOp Op, typename T, typename Comp>
MGPU_DEVICE int SerialSetOp(const T* data, int aBegin, int aEnd,
	int bBegin, int bEnd, int star, T* results, int* indices, Comp comp) {

	int end = aBegin + bBegin + VT - star;
	if(RangeCheck) end = min(end, aEnd + bEnd);

	// Defined result even when no case below matches.
	int commit = 0;
	switch(Op) {
		case MgpuSetOpIntersection:
			commit = SerialSetIntersection<VT, RangeCheck>(data, aBegin,
				aEnd, bBegin, bEnd, end, results, indices, comp);
			break;
		case MgpuSetOpUnion:
			commit = SerialSetUnion<VT, RangeCheck>(data, aBegin, aEnd,
				bBegin, bEnd, end, results, indices, comp);
			break;
		case MgpuSetOpDiff:
			commit = SerialSetDifference<VT, RangeCheck>(data, aBegin, aEnd,
				bBegin, bEnd, end, results, indices, comp);
			break;
		case MgpuSetOpSymDiff:
			commit = SerialSetSymDiff<VT, RangeCheck>(data, aBegin, aEnd,
				bBegin, bEnd, end, results, indices, comp);
			break;
	}
	__syncthreads();
	return commit;
}

} // namespace mgpu


================================================
FILE: 3rdparty/ctc_include/contrib/moderngpu/include/device/sortnetwork.cuh
================================================
/******************************************************************************
 * Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 *
 * Code and text by Sean Baxter, NVIDIA Research
 * See http://nvlabs.github.io/moderngpu for repository and documentation.
 *
 ******************************************************************************/

#pragma once

#include "deviceutil.cuh"

namespace mgpu {

////////////////////////////////////////////////////////////////////////////////
// Odd-even transposition sorting network. Sorts keys and values in-place in
// register.
// http://en.wikipedia.org/wiki/Odd%E2%80%93even_sort

// CUDA Compiler does not currently unroll these loops correctly. Write using
// template loop unrolling.
/*
template<int VT, typename T, typename V, typename Comp>
MGPU_DEVICE void OddEvenTransposeSort(T* keys, V* values, Comp comp) {
	#pragma unroll
	for(int level = 0; level < VT; ++level) {

		#pragma unroll
		for(int i = 1 & level; i < VT - 1; i += 2) {
			if(comp(keys[i + 1], keys[i])) {
				mgpu::swap(keys[i], keys[i + 1]);
				mgpu::swap(values[i], values[i + 1]);
			}
		}
	}
}*/

// One level of the odd-even transposition network, unrolled through template
// recursion: level I compares the neighbouring pairs starting at position
// (I & 1) and then hands over to level I + 1. The <I, I> specialization below
// ends the recursion once VT levels have run.
template<int I, int VT>
struct OddEvenTransposeSortT {
	// Sort segments marked by head flags. A pair (lo, lo + 1) is left alone
	// when the head flag between them is set, i.e. when (2<< lo) & flags is
	// non-zero, because the two values then belong to different segments.
	template<typename K, typename V, typename Comp>
	static MGPU_DEVICE void Sort(K* keys, V* values, int flags, Comp comp) {
		const int First = 1 & I;

		#pragma unroll
		for(int lo = First; lo < VT - 1; lo += 2) {
			const int hi = lo + 1;
			const bool sameSegment = (0 == ((2<< lo) & flags));
			if(sameSegment && comp(keys[hi], keys[lo])) {
				mgpu::swap(keys[lo], keys[hi]);
				mgpu::swap(values[lo], values[hi]);
			}
		}

		// Continue with the next level.
		OddEvenTransposeSortT<I + 1, VT>::Sort(keys, values, flags, comp);
	}
};

// Recursion terminator: all levels have been applied, nothing left to do.
template<int I> struct OddEvenTransposeSortT<I, I> {
	template<typename K, typename V, typename Comp>
	static MGPU_DEVICE void Sort(K* keys, V* values, int flags, Comp comp) { }
};

// Sort VT keys (and their paired values) in place with the odd-even
// transposition network. No head flags are set, so every neighbouring pair is
// eligible for swapping.
template<int VT, typename K, typename V, typename Comp>
MGPU_DEVICE void OddEvenTransposeSort(K* keys, V* values, Comp comp) {
	typedef OddEvenTransposeSortT<0, VT> Network;
	const int NoHeadFlags = 0;
	Network::Sort(keys, values, NoHeadFlags, comp);
}
// Sort VT keys (and their paired values) in place with the odd-even
// transposition network, passing the caller's head-flag bit field through so
// that pairs separated by a set flag are not swapped.
template<int VT, typename K, typename V, typename Comp>
MGPU_DEVICE void OddEvenTransposeSortFlags(K* keys, V* values, int flags,
	Comp comp) {
	typedef OddEvenTransposeSortT<0, VT> Network;
	Network::Sort(keys, values, flags, comp);
}

////////////////////////////////////////////////////////////////////////////////
// Batcher Odd-Even Mergesort network
// Unstable but executes much faster than the transposition sort.
// http://en.wikipedia.org/wiki/Batcher_odd%E2%80%93even_mergesort

// Compile-time generator for a Batcher odd-even mergesort network over the
// Width positions starting at Low. Count is the number of valid elements:
// comparators whose upper position is at or past Count are skipped. The
// <1, Low, Count> specialization below ends the recursion.
template<int Width, int Low, int Count>
struct OddEvenMergesortT {
	// Conditionally exchange positions a and b (keys and values together).
	// Nothing happens when b is outside the Count valid elements, when a head
	// flag lies between the two positions, or when comp(keys[b], keys[a]) is
	// false.
	template<typename K, typename V, typename Comp>
	MGPU_DEVICE static void CompareAndSwap(K* keys, V* values, int flags,
		int a, int b, Comp comp) {
		if(b < Count) {
			// Mask the bits between a and b. Any head flags in this interval
			// means the keys are in different segments and must not be swapped.
			// (2<< b) - 1 has bits 0..b set and (2<< a) - 1 has bits 0..a set,
			// so the XOR keeps bits a + 1 .. b.
			const int Mask = ((2<< b) - 1) ^ ((2<< a) - 1);
			if(!(Mask & flags) && comp(keys[b], keys[a])) {
				mgpu::swap(keys[b], keys[a]);
				mgpu::swap(values[b], values[a]);
			}
		}
	}

	// Merge step with stride R starting at position Low2. While 2 * R < Width
	// (Recurse is true) it first merges the two interleaved subsequences of
	// stride 2 * R - the one starting at Low2 and the one starting at
	// Low2 + R - and then applies a row of comparators (i, i + R).
	template<int R, int Low2, bool Recurse = 2 * R < Width>
	struct OddEvenMerge {
		template<typename K, typename V, typename Comp>
		MGPU_DEVICE static void Merge(K* keys, V* values, int flags,
			Comp comp) {
			// Compare and swap
			const int M = 2 * R;
			OddEvenMerge<M, Low2>::Merge(keys, values, flags, comp);
			OddEvenMerge<M, Low2 + R>::Merge(keys, values, flags, comp);

			// Comparators start at Low2 + R and advance by 2 * R while the
			// upper position i + R stays inside [Low2, Low2 + Width).
			#pragma unroll
			for(int i = Low2 + R; i + R < Low2 + Width; i += M)
				CompareAndSwap(keys, values, flags, i, i + R, comp);
		}
	};
	// Base case (2 * R >= Width): a single comparator (Low2, Low2 + R).
	template<int R, int Low2>
	struct OddEvenMerge<R, Low2, false> {
		template<typename K, typename V, typename Comp>
		MGPU_DEVICE static void Merge(K* keys, V* values, int flags,
			Comp comp) {
			CompareAndSwap(keys, values, flags, Low2, Low2 + R, comp);
		}
	};

	// Sort the Width positions starting at Low: sort each half of Width / 2
	// positions recursively, then merge the halves starting with stride 1.
	template<typename K, typename V, typename Comp>
	MGPU_DEVICE static void Sort(K* keys, V* values, int flags,
		Comp comp) {

		const int M = Width / 2;
		OddEvenMergesortT<M, Low, Count>::Sort(keys, values, flags, comp);
		OddEvenMergesortT<M, Low + M, Count>::Sort(keys, values, flags, comp);
		OddEvenMerge<1, Low>::Merge(keys, values, flags, comp);
	}
};
// Recursion terminator: a run of width 1 needs no work.
template<int Low, int Count> struct OddEvenMergesortT<1, Low, Count> {
	template<typename K, typename V, typename Comp>
	MGPU_DEVICE static void Sort(K* keys, V* values, int flags,
		Comp comp) { }
};

// Sort VT keys (and their paired values) in place with the Batcher odd-even
// mergesort network. The network width is 1 << sLogPow2<VT, true>::value and
// VT is passed as the valid element count. No head flags are set.
template<int VT, typename K, typename V, typename Comp>
MGPU_DEVICE void OddEvenMergesort(K* keys, V* values, Comp comp) {
	const int NetworkWidth = 1<< sLogPow2<VT, true>::value;
	typedef OddEvenMergesortT<NetworkWidth, 0, VT> Network;
	const int NoHeadFlags = 0;
	Network::Sort(keys, values, NoHeadFlags, comp);
}
// Sort VT keys (and their paired values) in place with the Batcher odd-even
// mergesort network, passing the caller's head-flag bit field through so that
// positions separated by a set flag are not exchanged. The network width is
// 1 << sLogPow2<VT, true>::value and VT is passed as the valid element count.
template<int VT, typename K, typename V, typename Comp>
MGPU_DEVICE void OddEvenMergesortFlags(K* keys, V* values, int flags,
	Comp comp) {
	const int NetworkWidth = 1<< sLogPow2<VT, true>::value;
	typedef OddEvenMergesortT<NetworkWidth, 0, VT> Network;
	Network::Sort(keys, values, flags, comp);
}

} // namespace mgpu


================================================
FILE: 3rdparty/ctc_include/contrib/moderngpu/include/mgpudevice.cuh
================================================
/******************************************************************************
 * Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 *
 * Code and text by Sean Baxter, NVIDIA Research
 * See http://nvlabs.github.io/moderngpu for repository and documentation.
 *
 ******************************************************************************/

#pragma once

#include "mgpuenums.h"
#include "device/deviceutil.cuh"

namespace mgpu {

////////////////////////////////////////////////////////////////////////////////
// device/loadstore.cuh

// For 0 <= i < VT:
//		index = NT * i + tid;
//		reg[i] = data[index];
// Synchronize after load.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceSharedToReg(InputIt data, int tid, T* reg,
	bool sync = true);

// For 0 <= i < VT:
//		index = NT * i + tid;
//		if(index < count) reg[i] = data[index];
// No synchronize after load.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToReg(int count, InputIt data, int tid,
	T* reg, bool sync = false);

template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToRegDefault(int count, InputIt data, int tid,
	T* reg, T init, bool sync = false);

// For 0 <= i < VT:
//		index = NT * i + tid;
//		if(index < count) reg[i] = data[index];
// No synchronize after load.
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToReg(int count, InputIt data, int tid,
	T* reg, bool sync = false);

// For 0 <= i < VT:
//		index = NT * i + tid;
//		if(index < count) reg[i] = data[index];
// No synchronize after load.
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToRegDefault2(int count, InputIt data, int tid,
	T* reg, T init, bool sync = false);

// For 0 <= i < VT:
//		index = NT * i + tid;
//		if(index < count) reg[i] = data[index];
// No synchronize after load.
// No optimized code path for count < NV (smaller generated code).
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToRegLoop(int count, InputIt data, int tid,
	T* reg, bool sync = false);


// For 0 <= i < VT:
//		index = VT * tid + i.
//		if(index < count) reg[i] = data[index];
// No synchronize after load.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToThread(int count, InputIt data, int tid,
	T* reg);

template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToThreadDefault(int count, InputIt data, int tid,
	T* reg, T init);

// For 0 <= i < VT:
//		index = NT * i + tid;
//		if(index < count) data[index] = reg[i];
// Synchronize after load.
template<int NT, int VT, typename OutputIt, typename T>
MGPU_DEVICE void DeviceRegToShared(const T* reg, int tid, OutputIt dest,
	bool sync = true);

// For 0 <= i < VT:
//		index = NT * i + tid;
//		if(index < count) dest[index] = reg[i];
// No synchronize after store (sync defaults to false).
template<int NT, int VT, typename OutputIt, typename T>
MGPU_DEVICE void DeviceRegToGlobal(int count, const T* reg, int tid,
	OutputIt dest, bool sync = false);

// For 0 <= index < count:
//		dest[index] = source[index];
// This function is intended to replace DeviceGlobalToShared in cases where
// count is much less than NT * VT.
// Synchronize after the copy (sync defaults to true).
template<int NT, typename InputIt, typename OutputIt>
MGPU_DEVICE void DeviceMemToMemLoop(int count, InputIt source, int tid,
	OutputIt dest, bool sync = true);

// For 0 <= index < count:
//		dest[index] = source[index];
// Synchronize after store (sync defaults to true).
template<int NT, int VT, typename T, typename OutputIt>
MGPU_DEVICE void DeviceSharedToGlobal(int count, const T* source, int tid,
	OutputIt dest, bool sync = true);

// For 0 <= index < count:
//		dest[index] = source[index];
// Synchronize after store (sync defaults to true).
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToShared(int count, InputIt source, int tid,
	T* dest, bool sync = true);

// Two-threshold variant of DeviceGlobalToShared (VT0/VT1 instead of VT).
// NOTE(review): VT0/VT1 presumably carry the meaning documented for
// DeviceLoad2ToShared (VT0 = predication-free lower bound, VT1 = upper bound
// on loads) - confirm in device/loadstore.cuh.
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToShared2(int count, InputIt source, int tid,
	T* dest, bool sync = true);

// For 0 <= index < count:
//		dest[index] = source[index];
// Synchronize after store (sync defaults to true).
// No optimized code path for count < NV (smaller generated code).
// NOTE(review): NV is not defined nearby; presumably NV = NT * VT.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToSharedLoop(int count, InputIt source, int tid,
	T* dest, bool sync = true);

// Variant of DeviceGlobalToShared that takes a fill value `init`.
// NOTE(review): presumably dest slots at index >= count are set to init;
// confirm in device/loadstore.cuh.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToSharedDefault(int count, InputIt source, int tid,
	T* dest, T init, bool sync = true);

// Two-threshold (VT0/VT1) form of DeviceGlobalToSharedDefault.
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToSharedDefault2(int count, InputIt source,
	int tid, T* dest, T init, bool sync = true);

// For 0 <= index < count:
//		dest[index] = source[index];
// No synchronize (sync defaults to false).
template<int NT, int VT, typename InputIt, typename OutputIt>
MGPU_DEVICE void DeviceGlobalToGlobal(int count, InputIt source, int tid,
	OutputIt dest, bool sync = false);

// Transpose VT elements in NT threads (x) into thread-order registers (y)
// using only NT * VT / 2 elements of shared memory.
template<int NT, int VT, typename T>
MGPU_DEVICE void HalfSmemTranspose(const T* x, int tid, T* shared, T* y);

// For 0 <= i < VT:
//		index = NT * i + tid;
//		if(index < count)
//			gather = indices[index];
//			reg[i] = data[gather];
// Synchronize after load (sync defaults to true).
// NOTE(review): indices is declared with VT entries, while index ranges up
// to NT * VT - 1, so the lookup is presumably indices[i] rather than
// indices[index] - confirm in device/loadstore.cuh.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGather(int count, InputIt data, int indices[VT],
	int tid, T* reg, bool sync = true);

// Variant of DeviceGather that takes a fill value `identity`.
// NOTE(review): presumably reg[i] = identity when index >= count; confirm.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGatherDefault(int count, InputIt data, int indices[VT],
	int tid, T* reg, T identity, bool sync = true);

// For 0 <= i < VT:
//		index = NT * i + tid;
//		if(index < count)
//			scatter = indices[index];
//			data[scatter] = reg[i];
// Synchronize after store (sync defaults to true).
// NOTE(review): indices is declared with VT entries, while index ranges up
// to NT * VT - 1, so the lookup is presumably indices[i] rather than
// indices[index] - confirm in device/loadstore.cuh.
template<int NT, int VT, typename T, typename OutputIt>
MGPU_DEVICE void DeviceScatter(int count, const T* reg, int tid,
	int indices[VT], OutputIt data, bool sync = true);

// For 0 <= i < VT:
//		shared[VT * tid + i] = threadReg[i];
// Synchronize after store (sync defaults to true).
// Note this function moves data in THREAD ORDER.
// (DeviceRegToShared moves data in STRIDED ORDER).
template<int VT, typename T>
MGPU_DEVICE void DeviceThreadToShared(const T* threadReg, int tid, T* shared,
	bool sync = true);

// For 0 <= i < VT:
//		threadReg[i] = shared[VT * tid + i];
// Synchronize after load (sync defaults to true).
// Note this function moves data in THREAD ORDER.
// (DeviceSharedToReg moves data in STRIDED ORDER).
template<int VT, typename T>
MGPU_DEVICE void DeviceSharedToThread(const T* shared, int tid, T* threadReg,
	bool sync = true);

// For 0 <= index < aCount:
//		shared[index] = a_global[index];
// For 0 <= index < bCount:
//		shared[aCount + index] = b_global[index];
// VT0 is the lower-bound for predication-free execution:
//		If count >= NT * VT0, a predication-free branch is taken.
// VT1 is the upper-bound for loads:
//		NT * VT1 must be >= aCount + bCount.
// NOTE(review): `count` above is presumably aCount + bCount.
// The ...ToReg overloads write to `reg` and default to sync = false; the
// ...ToShared overloads write to `shared` and default to sync = true.

// Pointer overloads (both inputs are const T*).
template<int NT, int VT0, int VT1, typename T>
MGPU_DEVICE void DeviceLoad2ToReg(const T* a_global, int aCount,
	const T* b_global, int bCount, int tid, T* reg, bool sync = false);

template<int NT, int VT0, int VT1, typename T>
MGPU_DEVICE void DeviceLoad2ToShared(const T* a_global, int aCount,
	const T* b_global, int bCount, int tid, T* shared, bool sync = true);

// Generic-iterator overloads (inputs may be two different iterator types).
template<int NT, int VT0, int VT1, typename InputIt1, typename InputIt2,
	typename T>
MGPU_DEVICE void DeviceLoad2ToReg(InputIt1 a_global, int aCount,
	InputIt2 b_global, int bCount, int tid, T* reg, bool sync = false);

template<int NT, int VT0, int VT1, typename InputIt1, typename InputIt2,
	typename T>
MGPU_DEVICE void DeviceLoad2ToShared(InputIt1 a_global, int aCount,
	InputIt2 b_global, int bCount, int tid, T* shared, bool sync = true);

// For 0 <= i < VT:
//		index = NT * i + tid;
//		if(index < count)
//			gather = indices_shared[index];
//			dest_global[index] = data_global[gather];
// Synchronize after load (sync defaults to true).
template<int NT, int VT, typename InputIt, typename OutputIt>
MGPU_DEVICE void DeviceGatherGlobalToGlobal(int count, InputIt data_global,
	const int* indices_shared, int tid, OutputIt dest_global,
	bool sync = true);

// For 0 <= i < VT:
//		index = NT * i + tid;
//		if(index < count)
//			gather = indices[index];
//			if(gather < aCount) data = a_global[gather];
//			else data = b_global[gather - aCount];
//			dest_global[index] = data;
// NOTE(review): the signatures below take `bStart`, not `aCount`; the
// pseudo-code above presumably uses aCount to mean the point at which
// gather indices switch from a_global to b_global - confirm how bStart
// enters the computation in device/loadstore.cuh.
// The ...Reg overloads take a `reg` destination instead of dest_global
// (presumably reg[i] = data) and default to sync = false.
// The ...Shared overloads read indices_shared, write dest_global, and
// default to sync = true.

// Generic-iterator overloads.
template<int NT, int VT, typename InputIt1, typename InputIt2,
	typename T>
MGPU_DEVICE void DeviceTransferMergeValuesReg(int count, InputIt1 a_global,
	InputIt2 b_global, int bStart, const int* indices, int tid,
	T* reg, bool sync = false);

template<int NT, int VT, typename InputIt1, typename InputIt2,
	typename OutputIt>
MGPU_DEVICE void DeviceTransferMergeValuesShared(int count, InputIt1 a_global,
	InputIt2 b_global, int bStart, const int* indices_shared, int tid,
	OutputIt dest_global, bool sync = true);

// Pointer overloads (both inputs are const T*).
template<int NT, int VT, typename T>
MGPU_DEVICE void DeviceTransferMergeValuesReg(int count, const T* a_global,
	const T* b_global, int bStart, const int* indices, int tid,
	T* reg, bool sync = false);

template<int NT, int VT, typename T, typename OutputIt>
MGPU_DEVICE void DeviceTransferMergeValuesShared(int count, const T* a_global,
	const T* b_global, int bStart, const int* indices_shared, int tid,
	OutputIt dest_global, bool sync = true);



} // namespace mgpu


#include "device/loadstore.cuh"
#include "device/ctasegscan.cuh"


================================================
FILE: 3rdparty/ctc_include/contrib/moderngpu/include/mgpuenums.h
================================================
/******************************************************************************
 * Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
 * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 *
 * Code and text by Sean Baxter, NVIDIA Research
 * See http://nvlabs.github.io/moderngpu for repository and documentation.
 *
 ******************************************************************************/

#pragma once 

namespace mgpu {

// Tag enums used as selectors by the library. Enumerator values are spelled
// out explicitly; they match the implicit 0, 1, 2, ... numbering.

// Lower-bound vs. upper-bound selector.
// NOTE(review): presumably chooses the comparison used by search routines.
enum MgpuBounds {
	MgpuBoundsLower = 0,
	MgpuBoundsUpper = 1
};

// Scan flavour selector.
// NOTE(review): Exc/Inc presumably mean exclusive/inclusive scan.
enum MgpuScanType {
	MgpuScanTypeExc = 0,
	MgpuScanTypeInc = 1
};

// Selects what a search reports: nothing, an index, a match flag, or both.
// NOTE(review): meaning inferred from the enumerator names; confirm at the
// use sites.
enum MgpuSearchType {
	MgpuSearchTypeNone = 0,
	MgpuSearchTypeIndex = 1,
	MgpuSearchTypeMatch = 2,
	MgpuSearchTypeIndexMatch = 3
};

// Join kind selector (inner / left / right / outer).
enum MgpuJoinKind {
	MgpuJoinKindInner = 0,
	MgpuJoinKindLeft = 1,
	MgpuJoinKindRight = 2,
	MgpuJoinKindOuter = 3
};

// Set operation selector (intersection / union / difference / symmetric
// difference).
enum MgpuSetOp {
	MgpuSetOpIntersection = 0,
	MgpuSetOpUnion = 1,
	MgpuSetOpDiff = 2,
	MgpuSetOpSymDiff = 3
};

} // namespace mgpu


================================================
FILE: 3rdparty/ctc_include/contrib/moderngpu/include/util/static.h
================================================
/******************************************************************************
 * Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
 * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 *
 * Code and text by Sean Baxter, NVIDIA Research
 * See http://nvlabs.github.io/moderngpu for repository and documentation.
 *
 ******************************************************************************/

#pragma once

#include <functional>
#include <iterator>
#include <cfloat>
#include <typeinfo>
#include <vector>
#include <list>
#include <map>
#include <algorithm>
#include <cassert>
#include <memory>
#include <cmath>
#include <cstdio>
#include <cstdlib>

#ifndef MGPU_MIN
// Generic arithmetic helper macros, defined once (guarded by MGPU_MIN).
// Every macro argument is parenthesized at each use so that compound
// expressions (e.g. `a - b`, `m & 1`) expand correctly.
// Arguments may be evaluated more than once: do not pass expressions with
// side effects.

// Smaller / larger of two values.
#define MGPU_MIN(x, y) (((x) <= (y)) ? (x) : (y))
#define MGPU_MAX(x, y) (((x) >= (y)) ? (x) : (y))
// x clamped below at 0.
#define MGPU_MAX0(x) (((x) >= 0) ? (x) : 0)
// Absolute value. The negation wraps the whole argument: the previous
// expansion `(-x)` turned MGPU_ABS(a - b) into `-a - b` on the negative path.
#define MGPU_ABS(x) (((x) >= 0) ? (x) : (-(x)))

// Integer division rounding up / rounding to nearest.
#define MGPU_DIV_UP(x, y) (((x) + (y) - 1) / (y))
#define MGPU_DIV_ROUND(x, y) (((x) + (y) / 2) / (y))
// x rounded up to a multiple of y.
#define MGPU_ROUND_UP(x, y) ((y) * MGPU_DIV_UP(x, y))
// Rounded-up division by 2^y. The shift count is parenthesized: the previous
// expansion `>> y` mis-parsed arguments containing operators that bind
// looser than shift (e.g. `&`, `|`, `?:`).
#define MGPU_SHIFT_DIV_UP(x, y) (((x) + ((1 << (y)) - 1)) >> (y))
// Round x up / down to a multiple of y, where y must be a power of two.
#define MGPU_ROUND_UP_POW2(x, y) (((x) + (y) - 1) & ~((y) - 1))
#define MGPU_ROUND_DOWN_POW2(x, y) ((x) & ~((y) - 1))
// True when x has at most one bit set (note: also true for x == 0).
#define MGPU_IS_POW_2(x) (0 == ((x) & ((x) - 1)))

#endif // MGPU_MIN

namespace mgpu {


// Short aliases for built-in integer types.
// NOTE(review): the 16/64 suffixes assume short is 16 bits and long long is
// 64 bits on the target; the language only guarantees those as minimum
// widths.
typedef unsigned char byte;

typedef unsigned int uint;
typedef signed short int16;

// ushort and uint16 both alias unsigned short.
typedef unsigned short ushort;
typedef unsigned short uint16;

typedef long long int64;
typedef unsigned long long uint64;

// IsPow2<X>::value is true if X is a power of 2.
template<int X> struct sIsPow2 {
Download .txt
gitextract_zlms863u/

├── .asf.yaml
├── .clang-format
├── .clang-tidy
├── .cmakelintrc
├── .codecov.yml
├── .git-blame-ignore-revs
├── .gitattributes
├── .github/
│   ├── ISSUE_TEMPLATE/
│   │   ├── bug_report.md
│   │   ├── config.yml
│   │   ├── feature_request.md
│   │   ├── flaky_test.md
│   │   └── rfc.md
│   ├── PULL_REQUEST_TEMPLATE.md
│   └── workflows/
│       ├── greetings.yml
│       ├── license_check.yml
│       ├── link_check.yml
│       ├── os_x_mklbuild.yml
│       └── os_x_staticbuild.yml
├── .gitignore
├── .gitmodules
├── .licenserc.yaml
├── .mxnet_root
├── 3rdparty/
│   ├── ctc_include/
│   │   ├── LICENSE
│   │   ├── contrib/
│   │   │   └── moderngpu/
│   │   │       ├── LICENSE
│   │   │       └── include/
│   │   │           ├── device/
│   │   │           │   ├── ctaloadbalance.cuh
│   │   │           │   ├── ctamerge.cuh
│   │   │           │   ├── ctascan.cuh
│   │   │           │   ├── ctasearch.cuh
│   │   │           │   ├── ctasegreduce.cuh
│   │   │           │   ├── ctasegscan.cuh
│   │   │           │   ├── ctasegsort.cuh
│   │   │           │   ├── ctasortedsearch.cuh
│   │   │           │   ├── devicetypes.cuh
│   │   │           │   ├── deviceutil.cuh
│   │   │           │   ├── intrinsics.cuh
│   │   │           │   ├── loadstore.cuh
│   │   │           │   ├── serialsets.cuh
│   │   │           │   └── sortnetwork.cuh
│   │   │           ├── mgpudevice.cuh
│   │   │           ├── mgpuenums.h
│   │   │           └── util/
│   │   │               └── static.h
│   │   └── detail/
│   │       ├── cpu_ctc.h
│   │       ├── ctc_helper.h
│   │       ├── gpu_ctc.h
│   │       ├── gpu_ctc_kernels.h
│   │       └── hostdevice.h
│   ├── miniz/
│   │   ├── miniz.c
│   │   └── miniz.h
│   └── mshadow/
│       ├── .gitignore
│       ├── .travis.yml
│       ├── CHANGES.md
│       ├── CMakeLists.txt
│       ├── LICENSE
│       ├── README.md
│       ├── cmake/
│       │   └── AutoDetectF16C.cmake
│       ├── doc/
│       │   ├── Doxyfile
│       │   ├── README.md
│       │   └── mkdoc.sh
│       ├── guide/
│       │   ├── .gitignore
│       │   ├── Makefile
│       │   ├── README.md
│       │   ├── basic.cpp
│       │   ├── basic_stream.cu
│       │   ├── defop.cpp
│       │   ├── exp-template/
│       │   │   ├── .gitignore
│       │   │   ├── Makefile
│       │   │   └── README.md
│       │   ├── mshadow-ps/
│       │   │   ├── .gitignore
│       │   │   ├── Makefile
│       │   │   ├── README.md
│       │   │   ├── dbstr.h
│       │   │   ├── dist_async_sum-inl.h
│       │   │   ├── dist_async_sum.cpp
│       │   │   ├── local.sh
│       │   │   ├── local_sum-inl.h
│       │   │   ├── local_sum.cpp
│       │   │   └── local_sum.cu
│       │   └── neuralnet/
│       │       ├── Makefile
│       │       ├── README.md
│       │       ├── convnet.cu
│       │       ├── nnet.cu
│       │       ├── nnet_ps.cu
│       │       └── util.h
│       ├── make/
│       │   ├── README.md
│       │   └── mshadow.mk
│       ├── mshadow/
│       │   ├── README.md
│       │   ├── base.h
│       │   ├── bfloat.h
│       │   ├── cuda/
│       │   │   ├── reduce.cuh
│       │   │   └── tensor_gpu-inl.cuh
│       │   ├── dot_engine-inl.h
│       │   ├── expr_engine-inl.h
│       │   ├── expr_scalar-inl.h
│       │   ├── expression.h
│       │   ├── extension/
│       │   │   ├── broadcast.h
│       │   │   ├── broadcast_with_axis.h
│       │   │   ├── channel_pool.h
│       │   │   ├── channel_unpool.h
│       │   │   ├── choose.h
│       │   │   ├── complex.h
│       │   │   ├── concat.h
│       │   │   ├── crop.h
│       │   │   ├── fill.h
│       │   │   ├── flip.h
│       │   │   ├── implicit_gemm.h
│       │   │   ├── mask.h
│       │   │   ├── mirror.h
│       │   │   ├── one_hot.h
│       │   │   ├── pack_col2patch.h
│       │   │   ├── pad.h
│       │   │   ├── range.h
│       │   │   ├── reduce_with_axis.h
│       │   │   ├── reduceto1d.h
│       │   │   ├── reshape.h
│       │   │   ├── slice.h
│       │   │   ├── slice_ex.h
│       │   │   ├── spatial_pool.h
│       │   │   ├── spatial_unpool.h
│       │   │   ├── spatial_upsampling_nearest.h
│       │   │   ├── swapaxis.h
│       │   │   ├── take.h
│       │   │   ├── take_grad.h
│       │   │   ├── transpose.h
│       │   │   └── unpack_patch2col.h
│       │   ├── extension.h
│       │   ├── half.h
│       │   ├── io.h
│       │   ├── packet/
│       │   │   ├── plain-inl.h
│       │   │   └── sse-inl.h
│       │   ├── packet-inl.h
│       │   ├── random.h
│       │   ├── stream_gpu-inl.h
│       │   ├── tensor.h
│       │   ├── tensor_container.h
│       │   ├── tensor_cpu-inl.h
│       │   └── tensor_gpu-inl.h
│       ├── mshadow-ps/
│       │   ├── .gitignore
│       │   ├── README.md
│       │   ├── mshadow_ps.h
│       │   ├── ps_dist-inl.h
│       │   ├── ps_local-inl.h
│       │   ├── ps_rabit-inl.h
│       │   ├── thread.h
│       │   └── thread_util.h
│       ├── scripts/
│       │   └── travis_script.sh
│       └── test/
│           ├── Makefile
│           ├── pairtest.cu
│           ├── pool.cu
│           ├── reshape.cu
│           ├── test.cu
│           ├── test.h
│           └── unpack.cu
├── CMakeLists.txt
├── CODEOWNERS
├── CODE_OF_CONDUCT.md
├── CONTRIBUTORS.md
├── DNNL_README.md
├── LICENSE
├── NEWS.md
├── NOTICE
├── README.md
├── SECURITY.md
├── benchmark/
│   ├── __init__.py
│   ├── opperf/
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── custom_operations/
│   │   │   ├── __init__.py
│   │   │   └── custom_operations.py
│   │   ├── nd_operations/
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   ├── array_manipulation_operators.py
│   │   │   ├── array_rearrange.py
│   │   │   ├── binary_operators.py
│   │   │   ├── gemm_operators.py
│   │   │   ├── indexing_routines.py
│   │   │   ├── linalg_operators.py
│   │   │   ├── misc_operators.py
│   │   │   ├── nn_activation_operators.py
│   │   │   ├── nn_basic_operators.py
│   │   │   ├── nn_conv_operators.py
│   │   │   ├── nn_loss_operators.py
│   │   │   ├── nn_optimizer_operators.py
│   │   │   ├── random_sampling_operators.py
│   │   │   ├── reduction_operators.py
│   │   │   ├── sorting_searching_operators.py
│   │   │   └── unary_operators.py
│   │   ├── opperf.py
│   │   ├── results/
│   │   │   ├── mxnet_operator_benchmark_results_cpu.md
│   │   │   └── mxnet_operator_benchmark_results_gpu.md
│   │   ├── rules/
│   │   │   ├── __init__.py
│   │   │   └── default_params.py
│   │   └── utils/
│   │       ├── __init__.py
│   │       ├── benchmark_operators_pytest.py
│   │       ├── benchmark_utils.py
│   │       ├── common_utils.py
│   │       ├── ndarray_utils.py
│   │       ├── op_registry_utils.py
│   │       └── profiler_utils.py
│   └── python/
│       ├── control_flow/
│       │   └── rnn.py
│       ├── dnnl/
│       │   ├── fc_add.py
│       │   ├── run.sh
│       │   └── run_per_thread.sh
│       ├── einsum/
│       │   └── benchmark_einsum.py
│       ├── ffi/
│       │   └── benchmark_ffi.py
│       ├── metric/
│       │   └── benchmark_metric.py
│       ├── quantization/
│       │   └── benchmark_op.py
│       ├── sparse/
│       │   ├── cast_storage.py
│       │   ├── dot.py
│       │   ├── memory_benchmark.py
│       │   ├── sparse_op.py
│       │   ├── updater.py
│       │   └── util.py
│       └── tvmop/
│           └── benchmark_tvmop.py
├── cd/
│   ├── Jenkinsfile_cd_pipeline
│   ├── Jenkinsfile_release_job
│   ├── Jenkinsfile_utils.groovy
│   ├── README.md
│   ├── mxnet_lib/
│   │   ├── Jenkins_pipeline.groovy
│   │   └── mxnet_lib_pipeline.groovy
│   ├── python/
│   │   ├── docker/
│   │   │   ├── Dockerfile
│   │   │   ├── Dockerfile.test
│   │   │   ├── Jenkins_pipeline.groovy
│   │   │   ├── python_images.sh
│   │   │   └── test_python_image.sh
│   │   └── pypi/
│   │       ├── Jenkins_pipeline.groovy
│   │       ├── README.md
│   │       ├── pypi_package.sh
│   │       └── pypi_publish.py
│   └── utils/
│       ├── artifact_repository.md
│       ├── artifact_repository.py
│       ├── docker_tag.sh
│       ├── mxnet_base_image.sh
│       └── test_artifact_repository.py
├── ci/
│   ├── Jenkinsfile_docker_cache
│   ├── Jenkinsfile_utils.groovy
│   ├── README.md
│   ├── __init__.py
│   ├── build.py
│   ├── build_windows.py
│   ├── dev_menu.py
│   ├── docker/
│   │   ├── Dockerfile.build.android
│   │   ├── Dockerfile.build.arm
│   │   ├── Dockerfile.build.centos7
│   │   ├── Dockerfile.build.jetson
│   │   ├── Dockerfile.build.ubuntu
│   │   ├── Dockerfile.build.ubuntu_cpu_jekyll
│   │   ├── Dockerfile.publish.test.centos7
│   │   ├── Dockerfile.test.arm
│   │   ├── docker-compose.yml
│   │   ├── install/
│   │   │   ├── deb_ubuntu_ccache.sh
│   │   │   ├── docker_filepermissions.sh
│   │   │   ├── requirements
│   │   │   └── ubuntu_adduser.sh
│   │   ├── runtime_functions.sh
│   │   └── toolchains/
│   │       ├── aarch64-linux-gnu-toolchain.cmake
│   │       └── arm-linux-gnueabihf-toolchain.cmake
│   ├── docker_login.py
│   ├── jenkins/
│   │   ├── Jenkins_steps.groovy
│   │   ├── Jenkinsfile_centos_cpu
│   │   ├── Jenkinsfile_centos_gpu
│   │   ├── Jenkinsfile_clang
│   │   ├── Jenkinsfile_edge
│   │   ├── Jenkinsfile_full
│   │   ├── Jenkinsfile_miscellaneous
│   │   ├── Jenkinsfile_sanity
│   │   ├── Jenkinsfile_tools
│   │   ├── Jenkinsfile_unix_cpu
│   │   ├── Jenkinsfile_unix_gpu
│   │   ├── Jenkinsfile_website_beta
│   │   ├── Jenkinsfile_website_full
│   │   ├── Jenkinsfile_website_full_pr
│   │   ├── Jenkinsfile_website_jekyll_docs
│   │   ├── Jenkinsfile_website_mxnet_build
│   │   ├── Jenkinsfile_website_nightly
│   │   ├── Jenkinsfile_website_python_docs
│   │   ├── Jenkinsfile_website_version_artifacts
│   │   ├── Jenkinsfile_windows_cpu
│   │   └── Jenkinsfile_windows_gpu
│   ├── logging.conf
│   ├── other/
│   │   └── ci_deploy_doc.sh
│   ├── publish/
│   │   ├── Jenkinsfile
│   │   ├── README.md
│   │   ├── python/
│   │   │   └── build.sh
│   │   ├── scala/
│   │   │   ├── build.sh
│   │   │   ├── buildkey.py
│   │   │   ├── deploy.sh
│   │   │   ├── fullDeploy.sh
│   │   │   └── test.sh
│   │   └── website/
│   │       ├── README.md
│   │       ├── beta-deploy.sh
│   │       ├── deploy.sh
│   │       └── publish_artifacts.sh
│   ├── test_docker_login.py
│   ├── util.py
│   └── windows/
│       ├── test_py3_cpu.ps1
│       └── test_py3_gpu.ps1
├── cmake/
│   ├── BuildCythonModules.cmake
│   ├── BuildTVM.cmake
│   ├── ChooseBlas.cmake
│   ├── Modules/
│   │   ├── FindAccelerate.cmake
│   │   ├── FindAtlas.cmake
│   │   ├── FindCUDNN.cmake
│   │   ├── FindCUTENSOR.cmake
│   │   ├── FindGperftools.cmake
│   │   ├── FindJeMalloc.cmake
│   │   ├── FindNCCL.cmake
│   │   ├── FindNVML.cmake
│   │   ├── FindNVTX.cmake
│   │   └── FindOpenBLAS.cmake
│   ├── Utils.cmake
│   ├── libmxnet.sym
│   └── upstream/
│       ├── FindBLAS.cmake
│       ├── FindCUDAToolkit.cmake
│       └── select_compute_arch.cmake
├── config/
│   ├── darwin.cmake
│   ├── distribution/
│   │   ├── darwin_cpu.cmake
│   │   ├── darwin_cpu_mkl.cmake
│   │   ├── darwin_native.cmake
│   │   ├── linux_cpu.cmake
│   │   ├── linux_cpu_mkl.cmake
│   │   ├── linux_cu100.cmake
│   │   ├── linux_cu101.cmake
│   │   ├── linux_cu102.cmake
│   │   ├── linux_cu110.cmake
│   │   ├── linux_cu112.cmake
│   │   ├── linux_cu92.cmake
│   │   └── linux_native.cmake
│   ├── linux.cmake
│   └── linux_gpu.cmake
├── conftest.py
├── contrib/
│   └── tvmop/
│       ├── __init__.py
│       ├── basic/
│       │   ├── __init__.py
│       │   └── ufunc.py
│       ├── compile.py
│       ├── core/
│       │   ├── __init__.py
│       │   ├── fromnumeric.py
│       │   ├── multiarray.py
│       │   └── umath.py
│       ├── opdef.py
│       ├── space.py
│       └── utils.py
├── cpp-package/
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── example/
│   │   ├── CMakeLists.txt
│   │   ├── README.md
│   │   ├── alexnet.cpp
│   │   ├── charRNN.cpp
│   │   ├── feature_extract/
│   │   │   ├── README.md
│   │   │   ├── feature_extract.cpp
│   │   │   ├── prepare_data_with_opencv.cpp
│   │   │   └── run.sh
│   │   ├── get_data.sh
│   │   ├── googlenet.cpp
│   │   ├── inception_bn.cpp
│   │   ├── inference/
│   │   │   ├── CMakeLists.txt
│   │   │   ├── README.md
│   │   │   ├── imagenet_inference.cpp
│   │   │   ├── multi_threaded_inference/
│   │   │   │   ├── get_model.py
│   │   │   │   ├── multi_threaded_inference.cc
│   │   │   │   └── unit_test_multi_threaded_inference.sh
│   │   │   ├── sentiment_analysis_rnn.cpp
│   │   │   ├── unit_test_imagenet_inference.sh
│   │   │   └── unit_test_sentiment_analysis_rnn.sh
│   │   ├── lenet.cpp
│   │   ├── lenet_with_mxdataiter.cpp
│   │   ├── mlp.cpp
│   │   ├── mlp_cpu.cpp
│   │   ├── mlp_csv.cpp
│   │   ├── mlp_gpu.cpp
│   │   ├── mnist_to_csv.py
│   │   ├── resnet.cpp
│   │   ├── run_lenet_with_mxdataiter.sh
│   │   ├── test_kvstore.cpp
│   │   ├── test_ndarray_copy.cpp
│   │   ├── test_optimizer.cpp
│   │   ├── test_regress_label.cpp
│   │   ├── test_score.cpp
│   │   ├── unittests/
│   │   │   └── unit_test_mlp_csv.sh
│   │   └── utils.h
│   ├── include/
│   │   └── mxnet-cpp/
│   │       ├── .gitignore
│   │       ├── CPPLINT.cfg
│   │       ├── MxNetCpp.h
│   │       ├── base.h
│   │       ├── contrib.h
│   │       ├── executor.h
│   │       ├── executor.hpp
│   │       ├── initializer.h
│   │       ├── io.h
│   │       ├── io.hpp
│   │       ├── kvstore.h
│   │       ├── kvstore.hpp
│   │       ├── lr_scheduler.h
│   │       ├── metric.h
│   │       ├── model.h
│   │       ├── ndarray.h
│   │       ├── ndarray.hpp
│   │       ├── op_map.h
│   │       ├── op_suppl.h
│   │       ├── op_util.h
│   │       ├── operator.h
│   │       ├── operator.hpp
│   │       ├── optimizer.h
│   │       ├── optimizer.hpp
│   │       ├── shape.h
│   │       ├── symbol.h
│   │       └── symbol.hpp
│   ├── scripts/
│   │   ├── OpWrapperGenerator.py
│   │   └── lint.py
│   └── tests/
│       └── ci_test.sh
├── doap.rdf
├── docker/
│   ├── .gitignore
│   ├── Dockerfiles/
│   │   ├── Dockerfile.in.julia
│   │   ├── Dockerfile.in.lib.cpu
│   │   ├── Dockerfile.in.lib.gpu
│   │   ├── Dockerfile.in.perl
│   │   ├── Dockerfile.in.python
│   │   ├── Dockerfile.in.r-lang
│   │   └── Dockerfile.in.scala
│   ├── README.md
│   ├── docker-python/
│   │   ├── README.md
│   │   ├── build_python_dockerfile.sh
│   │   └── test_mxnet.py
│   ├── install/
│   │   ├── cpp.sh
│   │   ├── julia.sh
│   │   ├── perl.sh
│   │   ├── python.sh
│   │   ├── r.sh
│   │   └── scala.sh
│   ├── run.sh
│   └── tool.sh
├── docs/
│   ├── .dockerignore
│   ├── .gitignore
│   ├── README.md
│   ├── cpp_docs/
│   │   ├── Doxyfile
│   │   └── Makefile
│   ├── python_docs/
│   │   ├── README.md
│   │   ├── _static/
│   │   │   ├── autodoc.js
│   │   │   ├── feedback.css
│   │   │   ├── matomo_analytics.js
│   │   │   └── mxnet.css
│   │   ├── python/
│   │   │   ├── .gitignore
│   │   │   ├── Makefile
│   │   │   ├── Makefile_sphinx
│   │   │   ├── api/
│   │   │   │   ├── autograd/
│   │   │   │   │   └── index.rst
│   │   │   │   ├── contrib/
│   │   │   │   │   ├── index.rst
│   │   │   │   │   ├── io/
│   │   │   │   │   │   └── index.rst
│   │   │   │   │   ├── ndarray/
│   │   │   │   │   │   └── index.rst
│   │   │   │   │   ├── onnx/
│   │   │   │   │   │   └── index.rst
│   │   │   │   │   ├── quantization/
│   │   │   │   │   │   └── index.rst
│   │   │   │   │   ├── symbol/
│   │   │   │   │   │   └── index.rst
│   │   │   │   │   ├── tensorboard/
│   │   │   │   │   │   └── index.rst
│   │   │   │   │   ├── tensorrt/
│   │   │   │   │   │   └── index.rst
│   │   │   │   │   └── text/
│   │   │   │   │       └── index.rst
│   │   │   │   ├── device/
│   │   │   │   │   └── index.rst
│   │   │   │   ├── engine/
│   │   │   │   │   └── index.rst
│   │   │   │   ├── executor/
│   │   │   │   │   └── index.rst
│   │   │   │   ├── gluon/
│   │   │   │   │   ├── block.rst
│   │   │   │   │   ├── constant.rst
│   │   │   │   │   ├── contrib/
│   │   │   │   │   │   └── index.rst
│   │   │   │   │   ├── hybrid_block.rst
│   │   │   │   │   ├── index.rst
│   │   │   │   │   ├── loss/
│   │   │   │   │   │   └── index.rst
│   │   │   │   │   ├── metric/
│   │   │   │   │   │   └── index.rst
│   │   │   │   │   ├── model_zoo/
│   │   │   │   │   │   └── index.rst
│   │   │   │   │   ├── nn/
│   │   │   │   │   │   └── index.rst
│   │   │   │   │   ├── parameter.rst
│   │   │   │   │   ├── rnn/
│   │   │   │   │   │   └── index.rst
│   │   │   │   │   ├── symbol_block.rst
│   │   │   │   │   ├── trainer.rst
│   │   │   │   │   └── utils/
│   │   │   │   │       └── index.rst
│   │   │   │   ├── index.rst
│   │   │   │   ├── initializer/
│   │   │   │   │   └── index.rst
│   │   │   │   ├── kvstore/
│   │   │   │   │   └── index.rst
│   │   │   │   ├── kvstore_server/
│   │   │   │   │   └── index.rst
│   │   │   │   ├── legacy/
│   │   │   │   │   ├── callback/
│   │   │   │   │   │   └── index.rst
│   │   │   │   │   ├── image/
│   │   │   │   │   │   └── index.rst
│   │   │   │   │   ├── index.rst
│   │   │   │   │   ├── io/
│   │   │   │   │   │   └── index.rst
│   │   │   │   │   ├── ndarray/
│   │   │   │   │   │   ├── contrib/
│   │   │   │   │   │   │   └── index.rst
│   │   │   │   │   │   ├── image/
│   │   │   │   │   │   │   └── index.rst
│   │   │   │   │   │   ├── index.rst
│   │   │   │   │   │   ├── linalg/
│   │   │   │   │   │   │   └── index.rst
│   │   │   │   │   │   ├── ndarray.rst
│   │   │   │   │   │   ├── op/
│   │   │   │   │   │   │   └── index.rst
│   │   │   │   │   │   ├── random/
│   │   │   │   │   │   │   └── index.rst
│   │   │   │   │   │   ├── register/
│   │   │   │   │   │   │   └── index.rst
│   │   │   │   │   │   ├── sparse/
│   │   │   │   │   │   │   └── index.rst
│   │   │   │   │   │   └── utils/
│   │   │   │   │   │       └── index.rst
│   │   │   │   │   ├── recordio/
│   │   │   │   │   │   └── index.rst
│   │   │   │   │   ├── symbol/
│   │   │   │   │   │   ├── contrib/
│   │   │   │   │   │   │   └── index.rst
│   │   │   │   │   │   ├── image/
│   │   │   │   │   │   │   └── index.rst
│   │   │   │   │   │   ├── index.rst
│   │   │   │   │   │   ├── linalg/
│   │   │   │   │   │   │   └── index.rst
│   │   │   │   │   │   ├── op/
│   │   │   │   │   │   │   └── index.rst
│   │   │   │   │   │   ├── random/
│   │   │   │   │   │   │   └── index.rst
│   │   │   │   │   │   ├── register/
│   │   │   │   │   │   │   └── index.rst
│   │   │   │   │   │   ├── sparse/
│   │   │   │   │   │   │   └── index.rst
│   │   │   │   │   │   └── symbol.rst
│   │   │   │   │   └── visualization/
│   │   │   │   │       └── index.rst
│   │   │   │   ├── lr_scheduler/
│   │   │   │   │   └── index.rst
│   │   │   │   ├── np/
│   │   │   │   │   ├── arrays.indexing.rst
│   │   │   │   │   ├── arrays.ndarray.rst
│   │   │   │   │   ├── arrays.rst
│   │   │   │   │   ├── index.rst
│   │   │   │   │   ├── random/
│   │   │   │   │   │   └── index.rst
│   │   │   │   │   ├── routines.array-creation.rst
│   │   │   │   │   ├── routines.array-manipulation.rst
│   │   │   │   │   ├── routines.io.rst
│   │   │   │   │   ├── routines.linalg.rst
│   │   │   │   │   ├── routines.math.rst
│   │   │   │   │   ├── routines.rst
│   │   │   │   │   ├── routines.sort.rst
│   │   │   │   │   └── routines.statistics.rst
│   │   │   │   ├── npx/
│   │   │   │   │   └── index.rst
│   │   │   │   ├── optimizer/
│   │   │   │   │   └── index.rst
│   │   │   │   ├── profiler/
│   │   │   │   │   └── index.rst
│   │   │   │   ├── rtc/
│   │   │   │   │   └── index.rst
│   │   │   │   ├── runtime/
│   │   │   │   │   └── index.rst
│   │   │   │   ├── test_utils/
│   │   │   │   │   └── index.rst
│   │   │   │   └── util/
│   │   │   │       └── index.rst
│   │   │   ├── index.rst
│   │   │   ├── scripts/
│   │   │   │   ├── conf.py
│   │   │   │   ├── md2ipynb.py
│   │   │   │   └── process_rst.py
│   │   │   └── tutorials/
│   │   │       ├── deploy/
│   │   │       │   ├── export/
│   │   │       │   │   ├── index.rst
│   │   │       │   │   └── onnx.md
│   │   │       │   ├── index.rst
│   │   │       │   ├── inference/
│   │   │       │   │   ├── cpp.rst
│   │   │       │   │   ├── image_classification_jetson.md
│   │   │       │   │   └── index.rst
│   │   │       │   └── run-on-aws/
│   │   │       │       ├── cloud.md
│   │   │       │       ├── index.rst
│   │   │       │       ├── use_ec2.rst
│   │   │       │       └── use_sagemaker.rst
│   │   │       ├── extend/
│   │   │       │   ├── customop.md
│   │   │       │   └── index.rst
│   │   │       ├── getting-started/
│   │   │       │   ├── crash-course/
│   │   │       │   │   ├── 0-introduction.md
│   │   │       │   │   ├── 1-nparray.md
│   │   │       │   │   ├── 2-create-nn.md
│   │   │       │   │   ├── 3-autograd.md
│   │   │       │   │   ├── 4-components.md
│   │   │       │   │   ├── 5-datasets.md
│   │   │       │   │   ├── 6-train-nn.md
│   │   │       │   │   ├── 7-use-gpus.md
│   │   │       │   │   ├── index.rst
│   │   │       │   │   └── prepare_dataset.py
│   │   │       │   ├── gluon_from_experiment_to_deployment.md
│   │   │       │   ├── gluon_migration_guide.md
│   │   │       │   ├── index.rst
│   │   │       │   ├── logistic_regression_explained.md
│   │   │       │   └── to-mxnet/
│   │   │       │       ├── index.rst
│   │   │       │       └── pytorch.md
│   │   │       ├── index.rst
│   │   │       ├── packages/
│   │   │       │   ├── autograd/
│   │   │       │   │   └── index.md
│   │   │       │   ├── gluon/
│   │   │       │   │   ├── blocks/
│   │   │       │   │   │   ├── activations/
│   │   │       │   │   │   │   └── activations.md
│   │   │       │   │   │   ├── custom-layer.md
│   │   │       │   │   │   ├── hybridize.md
│   │   │       │   │   │   ├── index.rst
│   │   │       │   │   │   ├── init.md
│   │   │       │   │   │   ├── naming.md
│   │   │       │   │   │   ├── nn.md
│   │   │       │   │   │   ├── parameters.md
│   │   │       │   │   │   └── save_load_params.md
│   │   │       │   │   ├── image/
│   │   │       │   │   │   ├── index.rst
│   │   │       │   │   │   ├── info_gan.md
│   │   │       │   │   │   └── mnist.md
│   │   │       │   │   ├── index.rst
│   │   │       │   │   ├── loss/
│   │   │       │   │   │   ├── custom-loss.md
│   │   │       │   │   │   ├── index.rst
│   │   │       │   │   │   ├── kl_divergence.md
│   │   │       │   │   │   └── loss.md
│   │   │       │   │   ├── text/
│   │   │       │   │   │   ├── gnmt.rst
│   │   │       │   │   │   ├── index.rst
│   │   │       │   │   │   └── transformer.rst
│   │   │       │   │   └── training/
│   │   │       │   │       ├── fit_api_tutorial.md
│   │   │       │   │       ├── index.rst
│   │   │       │   │       ├── learning_rates/
│   │   │       │   │       │   ├── index.rst
│   │   │       │   │       │   ├── learning_rate_finder.md
│   │   │       │   │       │   ├── learning_rate_schedules.md
│   │   │       │   │       │   └── learning_rate_schedules_advanced.md
│   │   │       │   │       ├── normalization/
│   │   │       │   │       │   └── index.md
│   │   │       │   │       └── trainer.md
│   │   │       │   ├── index.rst
│   │   │       │   ├── kvstore/
│   │   │       │   │   ├── index.rst
│   │   │       │   │   └── kvstore.md
│   │   │       │   ├── legacy/
│   │   │       │   │   ├── index.rst
│   │   │       │   │   └── ndarray/
│   │   │       │   │       ├── 01-ndarray-intro.md
│   │   │       │   │       ├── 02-ndarray-operations.md
│   │   │       │   │       ├── 03-ndarray-contexts.md
│   │   │       │   │       ├── gotchas_numpy_in_mxnet.md
│   │   │       │   │       ├── index.rst
│   │   │       │   │       └── sparse/
│   │   │       │   │           ├── csr.md
│   │   │       │   │           ├── index.rst
│   │   │       │   │           └── row_sparse.md
│   │   │       │   ├── np/
│   │   │       │   │   ├── cheat-sheet.md
│   │   │       │   │   ├── index.rst
│   │   │       │   │   └── np-vs-numpy.md
│   │   │       │   ├── onnx/
│   │   │       │   │   ├── fine_tuning_gluon.md
│   │   │       │   │   ├── index.rst
│   │   │       │   │   └── inference_on_onnx_model.md
│   │   │       │   ├── optimizer/
│   │   │       │   │   └── index.md
│   │   │       │   └── viz/
│   │   │       │       └── index.rst
│   │   │       └── performance/
│   │   │           ├── backend/
│   │   │           │   ├── amp.md
│   │   │           │   ├── dnnl/
│   │   │           │   │   ├── dnnl_quantization.md
│   │   │           │   │   ├── dnnl_quantization_inc.md
│   │   │           │   │   ├── dnnl_readme.md
│   │   │           │   │   └── index.rst
│   │   │           │   ├── index.rst
│   │   │           │   ├── profiler.md
│   │   │           │   └── tvm.rst
│   │   │           ├── compression/
│   │   │           │   ├── index.rst
│   │   │           │   └── int8.rst
│   │   │           └── index.rst
│   │   ├── requirements
│   │   └── themes/
│   │       ├── .babelrc
│   │       ├── .circleci/
│   │       │   └── config.yml
│   │       ├── .gitignore
│   │       ├── .sassrc
│   │       └── mx-theme/
│   │           ├── LICENSE
│   │           ├── MANIFEST.in
│   │           ├── README.md
│   │           ├── mxtheme/
│   │           │   ├── __init__.py
│   │           │   ├── card.py
│   │           │   ├── drawer.html
│   │           │   ├── feedback.html
│   │           │   ├── footer.html
│   │           │   ├── header.html
│   │           │   ├── header_search.html
│   │           │   ├── header_sourcelink.html
│   │           │   ├── header_top.html
│   │           │   ├── layout.html
│   │           │   ├── localtoc.html
│   │           │   ├── relations.html
│   │           │   ├── search.html
│   │           │   ├── static/
│   │           │   │   ├── fontawesome/
│   │           │   │   │   └── all.css
│   │           │   │   ├── fonts.css
│   │           │   │   ├── sphinx_materialdesign_theme.css
│   │           │   │   └── sphinx_materialdesign_theme.js
│   │           │   └── theme.conf
│   │           ├── setup.py
│   │           └── src/
│   │               ├── js/
│   │               │   ├── adjust-height.js
│   │               │   ├── feedback.js
│   │               │   ├── scrollspy.js
│   │               │   └── sphinx_materialdesign_theme.js
│   │               └── scss/
│   │                   ├── _root.scss
│   │                   ├── _variables.scss
│   │                   ├── admonitions/
│   │                   │   └── _admonitions.scss
│   │                   ├── blockquote/
│   │                   │   └── _blockquote.scss
│   │                   ├── card/
│   │                   │   └── _card.scss
│   │                   ├── code/
│   │                   │   └── _code.scss
│   │                   ├── downloadlink/
│   │                   │   └── _downloadlink.scss
│   │                   ├── drawer/
│   │                   │   └── _drawer.scss
│   │                   ├── fonts/
│   │                   │   └── _material-icons.scss
│   │                   ├── footer/
│   │                   │   └── _footer.scss
│   │                   ├── grid/
│   │                   │   └── _simplegrid.scss
│   │                   ├── header/
│   │                   │   └── _header.scss
│   │                   ├── headerings/
│   │                   │   └── _headerings.scss
│   │                   ├── layout/
│   │                   │   └── _layout.scss
│   │                   ├── lists/
│   │                   │   └── _lists.scss
│   │                   ├── search/
│   │                   │   └── _search.scss
│   │                   ├── sphinx_materialdesign_theme.scss
│   │                   ├── tables/
│   │                   │   └── _tables.scss
│   │                   └── toc/
│   │                       ├── _globaltoc.scss
│   │                       ├── _localtoc.scss
│   │                       └── _toctree.scss
│   ├── static_site/
│   │   ├── .gitignore
│   │   ├── .nojekyll
│   │   ├── Makefile
│   │   ├── README.md
│   │   └── src/
│   │       ├── .asf.yaml
│   │       ├── .gitignore
│   │       ├── .htaccess
│   │       ├── .nojekyll
│   │       ├── 404.html
│   │       ├── Gemfile
│   │       ├── _config.yml
│   │       ├── _config_beta.yml
│   │       ├── _config_prod.yml
│   │       ├── _includes/
│   │       │   ├── callout.html
│   │       │   ├── disqus_comments.html
│   │       │   ├── feedback.html
│   │       │   ├── footer.html
│   │       │   ├── get_started/
│   │       │   │   ├── cloud/
│   │       │   │   │   ├── cpu.md
│   │       │   │   │   └── gpu.md
│   │       │   │   ├── devices/
│   │       │   │   │   ├── nvidia-jetson.md
│   │       │   │   │   └── raspberry_pi.md
│   │       │   │   ├── get_started.html
│   │       │   │   ├── gpu_snippet.md
│   │       │   │   ├── linux/
│   │       │   │   │   ├── clojure/
│   │       │   │   │   │   └── build-from-source.md
│   │       │   │   │   ├── cpp/
│   │       │   │   │   │   └── build-from-source.md
│   │       │   │   │   ├── java/
│   │       │   │   │   │   └── build-from-source.md
│   │       │   │   │   ├── julia/
│   │       │   │   │   │   └── build-from-source.md
│   │       │   │   │   ├── perl/
│   │       │   │   │   │   └── build-from-source.md
│   │       │   │   │   ├── python/
│   │       │   │   │   │   ├── cpu/
│   │       │   │   │   │   │   ├── build-from-source.md
│   │       │   │   │   │   │   ├── docker.md
│   │       │   │   │   │   │   └── pip.md
│   │       │   │   │   │   └── gpu/
│   │       │   │   │   │       ├── build-from-source.md
│   │       │   │   │   │       ├── docker.md
│   │       │   │   │   │       └── pip.md
│   │       │   │   │   ├── r/
│   │       │   │   │   │   └── build-from-source.md
│   │       │   │   │   └── scala/
│   │       │   │   │       └── build-from-source.md
│   │       │   │   └── pip_snippet.md
│   │       │   ├── head.html
│   │       │   ├── header.html
│   │       │   ├── icon-github.html
│   │       │   ├── icon-twitter.html
│   │       │   ├── important.html
│   │       │   ├── matomo-analytics.html
│   │       │   ├── note.html
│   │       │   ├── social.html
│   │       │   ├── tip.html
│   │       │   └── warning.html
│   │       ├── _layouts/
│   │       │   ├── default.html
│   │       │   ├── home.html
│   │       │   ├── page.html
│   │       │   ├── page_api.html
│   │       │   ├── page_category.html
│   │       │   ├── page_landing_tutorials.html
│   │       │   └── post.html
│   │       ├── _plugins/
│   │       │   └── markdowner.rb
│   │       ├── _sass/
│   │       │   ├── feedback.scss
│   │       │   ├── generalVersionDropdown.scss
│   │       │   ├── globalSearch.scss
│   │       │   ├── minima/
│   │       │   │   ├── _base.scss
│   │       │   │   ├── _blog.scss
│   │       │   │   ├── _docs.scss
│   │       │   │   ├── _ecosystem.scss
│   │       │   │   ├── _features.scss
│   │       │   │   ├── _getting_started.scss
│   │       │   │   ├── _home.scss
│   │       │   │   ├── _layout.scss
│   │       │   │   ├── _syntax-highlighting.scss
│   │       │   │   ├── colorful.scss
│   │       │   │   └── simple-grid.scss
│   │       │   └── minima.scss
│   │       ├── assets/
│   │       │   ├── js/
│   │       │   │   ├── clipboard.js
│   │       │   │   ├── copycode.js
│   │       │   │   ├── feedback.js
│   │       │   │   ├── globalSearch.js
│   │       │   │   └── options.js
│   │       │   └── main.scss
│   │       ├── index.html
│   │       └── pages/
│   │           ├── api/
│   │           │   ├── api.html
│   │           │   ├── architecture/
│   │           │   │   ├── exception_handling.md
│   │           │   │   ├── note_data_loading.md
│   │           │   │   ├── note_engine.md
│   │           │   │   ├── note_memory.md
│   │           │   │   ├── overview.md
│   │           │   │   └── program_model.md
│   │           │   ├── clojure/
│   │           │   │   ├── docs/
│   │           │   │   │   └── tutorials/
│   │           │   │   │       ├── index.md
│   │           │   │   │       ├── kvstore.md
│   │           │   │   │       ├── module.md
│   │           │   │   │       ├── ndarray.md
│   │           │   │   │       ├── symbol.md
│   │           │   │   │       └── symbol_in_pictures.md
│   │           │   │   └── index.md
│   │           │   ├── cpp/
│   │           │   │   ├── docs/
│   │           │   │   │   └── tutorials/
│   │           │   │   │       ├── basics.md
│   │           │   │   │       ├── index.md
│   │           │   │   │       ├── multi_threaded_inference.md
│   │           │   │   │       ├── mxnet_cpp_inference_tutorial.md
│   │           │   │   │       └── subgraphAPI.md
│   │           │   │   └── index.md
│   │           │   ├── developer_guide/
│   │           │   │   ├── 1_github_contribution_and_PR_verification_tips.md
│   │           │   │   ├── debugging_and_performance_optimization_tips.md
│   │           │   │   ├── examine_forward_results_with_hooks.md
│   │           │   │   ├── exception_handing_and_custom_error_types.md
│   │           │   │   └── profiling.md
│   │           │   ├── faq/
│   │           │   │   ├── add_op_in_backend.md
│   │           │   │   ├── cloud.md
│   │           │   │   ├── distributed_training.md
│   │           │   │   ├── env_var.md
│   │           │   │   ├── float16.md
│   │           │   │   ├── gradient_compression.md
│   │           │   │   ├── large_tensor_support.md
│   │           │   │   ├── model_parallel_lstm.md
│   │           │   │   ├── new_op.md
│   │           │   │   ├── perf.md
│   │           │   │   ├── recordio.md
│   │           │   │   ├── s3_integration.md
│   │           │   │   ├── security.md
│   │           │   │   ├── tensor_inspector_tutorial.md
│   │           │   │   ├── using_rtc.md
│   │           │   │   └── why_mxnet.md
│   │           │   ├── java/
│   │           │   │   ├── docs/
│   │           │   │   │   └── tutorials/
│   │           │   │   │       ├── index.md
│   │           │   │   │       └── ssd_inference.md
│   │           │   │   └── index.md
│   │           │   ├── julia/
│   │           │   │   └── index.md
│   │           │   ├── perl/
│   │           │   │   ├── docs/
│   │           │   │   │   └── tutorials/
│   │           │   │   │       ├── index.md
│   │           │   │   │       ├── io.md
│   │           │   │   │       ├── kvstore.md
│   │           │   │   │       ├── ndarray.md
│   │           │   │   │       └── symbol.md
│   │           │   │   └── index.md
│   │           │   ├── python/
│   │           │   │   └── index.md
│   │           │   ├── r/
│   │           │   │   ├── docs/
│   │           │   │   │   └── tutorials/
│   │           │   │   │       ├── char_rnn_model.md
│   │           │   │   │       ├── classify_real_image_with_pretrained_model.md
│   │           │   │   │       ├── custom_iterator.md
│   │           │   │   │       ├── index.md
│   │           │   │   │       ├── multi_dim_lstm.md
│   │           │   │   │       ├── ndarray.md
│   │           │   │   │       └── symbol.md
│   │           │   │   └── index.md
│   │           │   └── scala/
│   │           │       ├── docs/
│   │           │       │   └── tutorials/
│   │           │       │       ├── index.md
│   │           │       │       ├── infer.md
│   │           │       │       ├── io.md
│   │           │       │       ├── kvstore.md
│   │           │       │       ├── ndarray.md
│   │           │       │       ├── symbol.md
│   │           │       │       └── symbol_in_pictures.md
│   │           │       └── index.md
│   │           ├── community/
│   │           │   ├── clang_format_guide.md
│   │           │   ├── code_guide.md
│   │           │   ├── code_review.md
│   │           │   ├── committer_guide.md
│   │           │   ├── community.md
│   │           │   ├── document.md
│   │           │   ├── error_handling.md
│   │           │   ├── git_howto.md
│   │           │   ├── index.md
│   │           │   └── pull_request.md
│   │           ├── ecosystem.html
│   │           ├── features.html
│   │           ├── get_started/
│   │           │   ├── build_from_source.md
│   │           │   ├── download.md
│   │           │   ├── index.html
│   │           │   ├── jetson_setup.md
│   │           │   └── validate_mxnet.md
│   │           └── trusted_by.html
│   └── tutorial_utils/
│       └── vision/
│           └── cnn_visualization/
│               └── gradcam.py
├── example/
│   ├── MXNetTutorialTemplate.ipynb
│   ├── README.md
│   ├── adversary/
│   │   ├── README.md
│   │   └── adversary_generation.ipynb
│   ├── bi-lstm-sort/
│   │   ├── README.md
│   │   └── bi-lstm-sort.ipynb
│   ├── distributed_training/
│   │   ├── README.md
│   │   ├── cifar10_dist.py
│   │   └── cifar10_kvstore_hvd.py
│   ├── distributed_training-horovod/
│   │   ├── README.md
│   │   ├── gluon_mnist.py
│   │   └── resnet50_imagenet.py
│   ├── extensions/
│   │   ├── lib_api/
│   │   │   ├── Makefile
│   │   │   ├── init_lib.cc
│   │   │   ├── libtest.cc
│   │   │   └── test_loading.py
│   │   ├── lib_custom_op/
│   │   │   ├── Makefile
│   │   │   ├── README.md
│   │   │   ├── gemm_lib.cc
│   │   │   ├── relu_lib.cc
│   │   │   ├── relu_lib.cu
│   │   │   ├── relu_lib.h
│   │   │   ├── test_gemm.py
│   │   │   ├── test_relu.py
│   │   │   ├── test_transposecsr.py
│   │   │   ├── test_transposerowsp.py
│   │   │   ├── transposecsr_lib.cc
│   │   │   └── transposerowsp_lib.cc
│   │   ├── lib_external_ops/
│   │   │   ├── CMakeLists.txt
│   │   │   ├── README.md
│   │   │   ├── init_lib.cc
│   │   │   ├── min_ex-inl.h
│   │   │   ├── min_ex.cc
│   │   │   ├── min_ex.cu
│   │   │   └── test_loading.py
│   │   ├── lib_pass/
│   │   │   ├── Makefile
│   │   │   ├── README.md
│   │   │   ├── pass_lib.cc
│   │   │   └── test_pass.py
│   │   └── lib_subgraph/
│   │       ├── Makefile
│   │       ├── README.md
│   │       ├── subgraph_lib.cc
│   │       └── test_subgraph.py
│   ├── gluon/
│   │   ├── actor_critic/
│   │   │   ├── README.md
│   │   │   └── actor_critic.py
│   │   ├── data.py
│   │   ├── house_prices/
│   │   │   ├── README.md
│   │   │   └── kaggle_k_fold_cross_validation.py
│   │   ├── image_classification.py
│   │   ├── mnist/
│   │   │   ├── README.md
│   │   │   └── mnist.py
│   │   └── super_resolution/
│   │       ├── README.md
│   │       └── super_resolution.py
│   ├── multi-task/
│   │   ├── README.md
│   │   └── multi-task-learning.ipynb
│   ├── probability/
│   │   └── VAE/
│   │       └── VAE.md
│   ├── profiler/
│   │   ├── README.md
│   │   ├── profiler_imageiter.py
│   │   ├── profiler_matmul.py
│   │   └── profiler_ndarray.py
│   ├── quantization/
│   │   ├── README.md
│   │   ├── imagenet_gen_qsym_onednn.py
│   │   ├── imagenet_inference.py
│   │   └── launch_inference_onednn.sh
│   ├── quantization_inc/
│   │   ├── custom_strategy.py
│   │   ├── resnet50v2_mse.yaml
│   │   ├── resnet_measurement.py
│   │   ├── resnet_mse.py
│   │   └── resnet_tuning.py
│   └── recommenders/
│       ├── .gitignore
│       ├── README.md
│       ├── demo1-MF.ipynb
│       ├── demo2-dssm.ipynb
│       ├── matrix_fact.py
│       └── movielens_data.py
├── include/
│   └── mxnet/
│       ├── api_registry.h
│       ├── base.h
│       ├── c_api.h
│       ├── c_api_error.h
│       ├── c_api_test.h
│       ├── engine.h
│       ├── executor.h
│       ├── expr_operator.h
│       ├── graph_attr_types.h
│       ├── imperative.h
│       ├── io.h
│       ├── ir/
│       │   └── expr.h
│       ├── kvstore.h
│       ├── lib_api.h
│       ├── libinfo.h
│       ├── ndarray.h
│       ├── node/
│       │   ├── container.h
│       │   └── node.h
│       ├── op_attr_types.h
│       ├── operator.h
│       ├── operator_util.h
│       ├── random_generator.h
│       ├── resource.h
│       ├── rtc.h
│       ├── runtime/
│       │   ├── c_runtime_api.h
│       │   ├── container.h
│       │   ├── container_ext.h
│       │   ├── data_type.h
│       │   ├── ffi_helper.h
│       │   ├── memory.h
│       │   ├── ndarray.h
│       │   ├── ndarray_handle.h
│       │   ├── object.h
│       │   ├── packed_func.h
│       │   ├── py_arg.h
│       │   └── registry.h
│       ├── storage.h
│       ├── tensor_blob.h
│       └── tuple.h
├── licenses/
│   ├── BOOST1_0
│   ├── BSD2
│   ├── BSD3-cmake
│   ├── MIT
│   └── OFL1_1
├── plugin/
│   ├── opencv/
│   │   ├── __init__.py
│   │   ├── cv_api.cc
│   │   ├── cv_api.h
│   │   ├── opencv.mk
│   │   └── opencv.py
│   ├── sframe/
│   │   ├── iter_sframe.cc
│   │   └── plugin.mk
│   ├── torch/
│   │   ├── torch.mk
│   │   ├── torch_base.cc
│   │   ├── torch_base.h
│   │   ├── torch_criterion-inl.h
│   │   ├── torch_criterion.cc
│   │   ├── torch_criterion.cu
│   │   ├── torch_function.cc
│   │   ├── torch_function.h
│   │   ├── torch_module-inl.h
│   │   ├── torch_module.cc
│   │   └── torch_module.cu
│   └── warpctc/
│       ├── warpctc-inl.h
│       ├── warpctc.cc
│       ├── warpctc.cu
│       └── warpctc.mk
├── prospector.yaml
├── pytest.ini
├── python/
│   ├── .gitignore
│   ├── README.md
│   ├── mxnet/
│   │   ├── __init__.py
│   │   ├── _api_internal.py
│   │   ├── _ctypes/
│   │   │   ├── __init__.py
│   │   │   ├── _api_internal.py
│   │   │   ├── cached_op.py
│   │   │   ├── ndarray.py
│   │   │   ├── space.py
│   │   │   └── symbol.py
│   │   ├── _cy3/
│   │   │   ├── README.md
│   │   │   └── __init__.py
│   │   ├── _deferred_compute.py
│   │   ├── _ffi/
│   │   │   ├── __init__.py
│   │   │   ├── _ctypes/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── function.py
│   │   │   │   ├── object.py
│   │   │   │   └── types.py
│   │   │   ├── _cy3/
│   │   │   │   └── __init__.py
│   │   │   ├── _cython/
│   │   │   │   ├── base.pxi
│   │   │   │   ├── core.pyx
│   │   │   │   ├── function.pxi
│   │   │   │   ├── ndarray.pxi
│   │   │   │   └── object.pxi
│   │   │   ├── base.py
│   │   │   ├── function.py
│   │   │   ├── node_generic.py
│   │   │   ├── object.py
│   │   │   └── runtime_ctypes.py
│   │   ├── _global_var.py
│   │   ├── _numpy_op_doc.py
│   │   ├── amp/
│   │   │   ├── __init__.py
│   │   │   ├── amp.py
│   │   │   ├── lists/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── symbol_bf16.py
│   │   │   │   └── symbol_fp16.py
│   │   │   └── loss_scaler.py
│   │   ├── api.py
│   │   ├── attribute.py
│   │   ├── autograd.py
│   │   ├── base.py
│   │   ├── callback.py
│   │   ├── container.py
│   │   ├── context.py
│   │   ├── contrib/
│   │   │   ├── __init__.py
│   │   │   ├── io.py
│   │   │   ├── ndarray.py
│   │   │   ├── onnx/
│   │   │   │   └── __init__.py
│   │   │   ├── quantization.py
│   │   │   ├── symbol.py
│   │   │   ├── tensorboard.py
│   │   │   ├── tensorrt.py
│   │   │   └── text/
│   │   │       ├── __init__.py
│   │   │       ├── _constants.py
│   │   │       ├── embedding.py
│   │   │       ├── utils.py
│   │   │       └── vocab.py
│   │   ├── cuda/
│   │   │   ├── __init__.py
│   │   │   └── nvtx.py
│   │   ├── cython/
│   │   │   ├── __init__.py
│   │   │   ├── base.pyi
│   │   │   ├── ndarray.pyx
│   │   │   └── symbol.pyx
│   │   ├── device.py
│   │   ├── dlpack.py
│   │   ├── engine.py
│   │   ├── error.py
│   │   ├── executor.py
│   │   ├── gluon/
│   │   │   ├── .gitignore
│   │   │   ├── __init__.py
│   │   │   ├── block.py
│   │   │   ├── contrib/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── data/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── _constants.py
│   │   │   │   │   └── vision/
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── dataloader.py
│   │   │   │   │       └── transforms/
│   │   │   │   │           ├── __init__.py
│   │   │   │   │           └── bbox/
│   │   │   │   │               ├── __init__.py
│   │   │   │   │               ├── bbox.py
│   │   │   │   │               └── utils.py
│   │   │   │   └── estimator/
│   │   │   │       ├── __init__.py
│   │   │   │       ├── batch_processor.py
│   │   │   │       ├── estimator.py
│   │   │   │       ├── event_handler.py
│   │   │   │       └── utils.py
│   │   │   ├── data/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── _internal.py
│   │   │   │   ├── batchify.py
│   │   │   │   ├── dataloader.py
│   │   │   │   ├── dataset.py
│   │   │   │   ├── sampler.py
│   │   │   │   └── vision/
│   │   │   │       ├── __init__.py
│   │   │   │       ├── datasets.py
│   │   │   │       └── transforms/
│   │   │   │           ├── __init__.py
│   │   │   │           └── image.py
│   │   │   ├── loss.py
│   │   │   ├── metric.py
│   │   │   ├── model_zoo/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── model_store.py
│   │   │   │   └── vision/
│   │   │   │       ├── __init__.py
│   │   │   │       ├── alexnet.py
│   │   │   │       ├── densenet.py
│   │   │   │       ├── inception.py
│   │   │   │       ├── mobilenet.py
│   │   │   │       ├── resnet.py
│   │   │   │       ├── squeezenet.py
│   │   │   │       └── vgg.py
│   │   │   ├── nn/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── activations.py
│   │   │   │   ├── basic_layers.py
│   │   │   │   └── conv_layers.py
│   │   │   ├── parameter.py
│   │   │   ├── probability/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── block/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── stochastic_block.py
│   │   │   │   ├── distributions/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── bernoulli.py
│   │   │   │   │   ├── beta.py
│   │   │   │   │   ├── binomial.py
│   │   │   │   │   ├── categorical.py
│   │   │   │   │   ├── cauchy.py
│   │   │   │   │   ├── chi2.py
│   │   │   │   │   ├── constraint.py
│   │   │   │   │   ├── dirichlet.py
│   │   │   │   │   ├── distribution.py
│   │   │   │   │   ├── divergence.py
│   │   │   │   │   ├── exp_family.py
│   │   │   │   │   ├── exponential.py
│   │   │   │   │   ├── fishersnedecor.py
│   │   │   │   │   ├── gamma.py
│   │   │   │   │   ├── geometric.py
│   │   │   │   │   ├── gumbel.py
│   │   │   │   │   ├── half_cauchy.py
│   │   │   │   │   ├── half_normal.py
│   │   │   │   │   ├── independent.py
│   │   │   │   │   ├── laplace.py
│   │   │   │   │   ├── multinomial.py
│   │   │   │   │   ├── multivariate_normal.py
│   │   │   │   │   ├── negative_binomial.py
│   │   │   │   │   ├── normal.py
│   │   │   │   │   ├── one_hot_categorical.py
│   │   │   │   │   ├── pareto.py
│   │   │   │   │   ├── poisson.py
│   │   │   │   │   ├── relaxed_bernoulli.py
│   │   │   │   │   ├── relaxed_one_hot_categorical.py
│   │   │   │   │   ├── studentT.py
│   │   │   │   │   ├── transformed_distribution.py
│   │   │   │   │   ├── uniform.py
│   │   │   │   │   ├── utils.py
│   │   │   │   │   └── weibull.py
│   │   │   │   └── transformation/
│   │   │   │       ├── __init__.py
│   │   │   │       ├── domain_map.py
│   │   │   │       └── transformation.py
│   │   │   ├── rnn/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── conv_rnn_cell.py
│   │   │   │   ├── rnn_cell.py
│   │   │   │   └── rnn_layer.py
│   │   │   ├── trainer.py
│   │   │   └── utils.py
│   │   ├── image/
│   │   │   ├── __init__.py
│   │   │   ├── detection.py
│   │   │   └── image.py
│   │   ├── initializer.py
│   │   ├── io/
│   │   │   ├── __init__.py
│   │   │   ├── io.py
│   │   │   └── utils.py
│   │   ├── kvstore/
│   │   │   ├── __init__.py
│   │   │   ├── base.py
│   │   │   ├── byteps.py
│   │   │   ├── horovod.py
│   │   │   ├── kvstore.py
│   │   │   └── kvstore_server.py
│   │   ├── libinfo.py
│   │   ├── library.py
│   │   ├── log.py
│   │   ├── lr_scheduler.py
│   │   ├── misc.py
│   │   ├── model.py
│   │   ├── name.py
│   │   ├── ndarray/
│   │   │   ├── __init__.py
│   │   │   ├── _internal.py
│   │   │   ├── contrib.py
│   │   │   ├── image.py
│   │   │   ├── linalg.py
│   │   │   ├── ndarray.py
│   │   │   ├── numpy/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── _api_internal.py
│   │   │   │   ├── _internal.py
│   │   │   │   ├── _op.py
│   │   │   │   ├── _register.py
│   │   │   │   ├── linalg.py
│   │   │   │   └── random.py
│   │   │   ├── numpy_extension/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── _api_internal.py
│   │   │   │   ├── _op.py
│   │   │   │   ├── _register.py
│   │   │   │   ├── control_flow.py
│   │   │   │   ├── image.py
│   │   │   │   └── random.py
│   │   │   ├── op.py
│   │   │   ├── random.py
│   │   │   ├── register.py
│   │   │   ├── sparse.py
│   │   │   └── utils.py
│   │   ├── ndarray_doc.py
│   │   ├── notebook/
│   │   │   ├── __init__.py
│   │   │   └── callback.py
│   │   ├── numpy/
│   │   │   ├── __init__.py
│   │   │   ├── _op.py
│   │   │   ├── _register.py
│   │   │   ├── arrayprint.py
│   │   │   ├── fallback.py
│   │   │   ├── fallback_linalg.py
│   │   │   ├── function_base.py
│   │   │   ├── io.py
│   │   │   ├── linalg.py
│   │   │   ├── multiarray.py
│   │   │   ├── random.py
│   │   │   ├── set_functions.py
│   │   │   ├── stride_tricks.py
│   │   │   ├── type_functions.py
│   │   │   └── utils.py
│   │   ├── numpy_dispatch_protocol.py
│   │   ├── numpy_extension/
│   │   │   ├── __init__.py
│   │   │   ├── _op.py
│   │   │   ├── _register.py
│   │   │   ├── control_flow.py
│   │   │   ├── image.py
│   │   │   ├── random.py
│   │   │   └── utils.py
│   │   ├── numpy_op_fallback.py
│   │   ├── numpy_op_signature.py
│   │   ├── onnx/
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   ├── mx2onnx/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── _export_helper.py
│   │   │   │   ├── _export_model.py
│   │   │   │   ├── _export_onnx.py
│   │   │   │   └── _op_translations/
│   │   │   │       ├── __init__.py
│   │   │   │       ├── _op_translations_opset12.py
│   │   │   │       └── _op_translations_opset13.py
│   │   │   └── setup.py
│   │   ├── operator.py
│   │   ├── optimizer/
│   │   │   ├── __init__.py
│   │   │   ├── adabelief.py
│   │   │   ├── adadelta.py
│   │   │   ├── adagrad.py
│   │   │   ├── adam.py
│   │   │   ├── adamW.py
│   │   │   ├── adamax.py
│   │   │   ├── contrib.py
│   │   │   ├── dcasgd.py
│   │   │   ├── ftml.py
│   │   │   ├── ftrl.py
│   │   │   ├── lamb.py
│   │   │   ├── lans.py
│   │   │   ├── lars.py
│   │   │   ├── nadam.py
│   │   │   ├── nag.py
│   │   │   ├── optimizer.py
│   │   │   ├── rmsprop.py
│   │   │   ├── sgd.py
│   │   │   ├── sgld.py
│   │   │   ├── signum.py
│   │   │   ├── updater.py
│   │   │   └── utils.py
│   │   ├── profiler.py
│   │   ├── random.py
│   │   ├── recordio.py
│   │   ├── registry.py
│   │   ├── rtc.py
│   │   ├── runtime.py
│   │   ├── symbol/
│   │   │   ├── __init__.py
│   │   │   ├── _internal.py
│   │   │   ├── contrib.py
│   │   │   ├── image.py
│   │   │   ├── linalg.py
│   │   │   ├── numpy/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── _internal.py
│   │   │   │   ├── _op.py
│   │   │   │   ├── _register.py
│   │   │   │   ├── _symbol.py
│   │   │   │   ├── linalg.py
│   │   │   │   └── random.py
│   │   │   ├── numpy_extension/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── _op.py
│   │   │   │   ├── _register.py
│   │   │   │   ├── image.py
│   │   │   │   └── random.py
│   │   │   ├── op.py
│   │   │   ├── random.py
│   │   │   ├── register.py
│   │   │   ├── sparse.py
│   │   │   └── symbol.py
│   │   ├── symbol_doc.py
│   │   ├── test_utils.py
│   │   ├── tvmop.py
│   │   ├── util.py
│   │   └── visualization.py
│   └── setup.py
├── rat-excludes
├── readthedocs.yml
├── snap.python
├── src/
│   ├── api/
│   │   ├── _api_internal/
│   │   │   └── _api_internal.cc
│   │   ├── cached_op_api.cc
│   │   └── operator/
│   │       ├── numpy/
│   │       │   ├── linalg/
│   │       │   │   ├── np_det.cc
│   │       │   │   ├── np_eig.cc
│   │       │   │   ├── np_eigvals.cc
│   │       │   │   ├── np_gesvd.cc
│   │       │   │   ├── np_inv.cc
│   │       │   │   ├── np_lstsq.cc
│   │       │   │   ├── np_matrix_rank.cc
│   │       │   │   ├── np_norm.cc
│   │       │   │   ├── np_pinv.cc
│   │       │   │   ├── np_potrf.cc
│   │       │   │   ├── np_qr.cc
│   │       │   │   ├── np_slogdet.cc
│   │       │   │   ├── np_solve.cc
│   │       │   │   ├── np_tensorinv.cc
│   │       │   │   └── np_tensorsolve.cc
│   │       │   ├── np_bincount_op.cc
│   │       │   ├── np_broadcast_reduce_op_boolean.cc
│   │       │   ├── np_broadcast_reduce_op_index.cc
│   │       │   ├── np_broadcast_reduce_op_value.cc
│   │       │   ├── np_cross.cc
│   │       │   ├── np_cumsum.cc
│   │       │   ├── np_delete_op.cc
│   │       │   ├── np_diff_op.cc
│   │       │   ├── np_dot_op.cc
│   │       │   ├── np_ediff1d_op.cc
│   │       │   ├── np_einsum_op.cc
│   │       │   ├── np_elemwise_broadcast_logic_op.cc
│   │       │   ├── np_elemwise_broadcast_op.cc
│   │       │   ├── np_elemwise_broadcast_op_extended_sec.cc
│   │       │   ├── np_elemwise_unary_op_basic.cc
│   │       │   ├── np_fill_diagonal_op.cc
│   │       │   ├── np_histogram_op.cc
│   │       │   ├── np_init_op.cc
│   │       │   ├── np_insert_op.cc
│   │       │   ├── np_interp_op.cc
│   │       │   ├── np_kron.cc
│   │       │   ├── np_matmul_op.cc
│   │       │   ├── np_matrix_op.cc
│   │       │   ├── np_memory_op.cc
│   │       │   ├── np_moments_op.cc
│   │       │   ├── np_nan_to_num_op.cc
│   │       │   ├── np_nonzero_op.cc
│   │       │   ├── np_ordering_op.cc
│   │       │   ├── np_pad_op.cc
│   │       │   ├── np_percentile_op.cc
│   │       │   ├── np_polynomial_op.cc
│   │       │   ├── np_repeat_op.cc
│   │       │   ├── np_tensordot_op.cc
│   │       │   ├── np_trace_op.cc
│   │       │   ├── np_tri_op.cc
│   │       │   ├── np_tril_op.cc
│   │       │   ├── np_triu_op.cc
│   │       │   ├── np_unique_op.cc
│   │       │   ├── np_where_op.cc
│   │       │   ├── np_window_op.cc
│   │       │   └── random/
│   │       │       ├── np_choice_op.cc
│   │       │       ├── np_exponential_op.cc
│   │       │       ├── np_laplace_op.cc
│   │       │       ├── np_location_scale_op.cc
│   │       │       ├── np_multinomial_op.cc
│   │       │       ├── np_pareto_op.cc
│   │       │       ├── np_power_op.cc
│   │       │       ├── np_rayleigh_op.cc
│   │       │       └── np_weibull_op.cc
│   │       ├── numpy_extension/
│   │       │   ├── npx_activation_op.cc
│   │       │   ├── npx_arange_like_op.cc
│   │       │   ├── npx_batch_dot_op.cc
│   │       │   ├── npx_batch_norm_op.cc
│   │       │   ├── npx_broadcast_like_op.cc
│   │       │   ├── npx_control_flow_op.cc
│   │       │   ├── npx_convolution_op.cc
│   │       │   ├── npx_deconvolution_op.cc
│   │       │   ├── npx_dropout_op.cc
│   │       │   ├── npx_embedding_op.cc
│   │       │   ├── npx_fully_connected_op.cc
│   │       │   ├── npx_group_norm_op.cc
│   │       │   ├── npx_layer_norm_op.cc
│   │       │   ├── npx_leaky_relu_op.cc
│   │       │   ├── npx_one_hot_op.cc
│   │       │   ├── npx_pick_op.cc
│   │       │   ├── npx_pooling_op.cc
│   │       │   ├── npx_rnn_op.cc
│   │       │   ├── npx_softmax_op.cc
│   │       │   └── npx_topk_op.cc
│   │       ├── op_utils.cc
│   │       ├── op_utils.h
│   │       ├── random/
│   │       │   ├── np_gamma_op.cc
│   │       │   ├── np_normal_op.cc
│   │       │   ├── np_randint_op.cc
│   │       │   ├── np_uniform_op.cc
│   │       │   └── shuffle_op.cc
│   │       ├── tensor/
│   │       │   ├── elemwise_binary_broadcast_op_extended.cc
│   │       │   ├── indexing_op.cc
│   │       │   ├── matrix_op.cc
│   │       │   └── unravel.cc
│   │       ├── ufunc_helper.cc
│   │       ├── ufunc_helper.h
│   │       ├── utils.cc
│   │       └── utils.h
│   ├── base.cc
│   ├── c_api/
│   │   ├── .clang-tidy
│   │   ├── c_api.cc
│   │   ├── c_api_common.h
│   │   ├── c_api_function.cc
│   │   ├── c_api_ndarray.cc
│   │   ├── c_api_profile.cc
│   │   ├── c_api_symbolic.cc
│   │   └── c_api_test.cc
│   ├── common/
│   │   ├── alm.cc
│   │   ├── alm.h
│   │   ├── cuda/
│   │   │   ├── cudnn_cxx.cc
│   │   │   ├── cudnn_cxx.h
│   │   │   ├── nvtx.h
│   │   │   ├── rtc/
│   │   │   │   ├── backward_functions-inl.h
│   │   │   │   ├── forward_functions-inl.h
│   │   │   │   ├── half-inl.h
│   │   │   │   ├── reducer-inl.h
│   │   │   │   ├── special_functions-inl.h
│   │   │   │   ├── util-inl.h
│   │   │   │   └── vectorization-inl.h
│   │   │   ├── rtc.cc
│   │   │   ├── rtc.h
│   │   │   ├── utils.cc
│   │   │   └── utils.h
│   │   ├── exec_utils.cc
│   │   ├── exec_utils.h
│   │   ├── lazy_alloc_array.h
│   │   ├── object_pool.h
│   │   ├── random_generator.cu
│   │   ├── rtc.cc
│   │   ├── static_array.h
│   │   ├── tensor_inspector.h
│   │   ├── utils.cc
│   │   ├── utils.cu
│   │   └── utils.h
│   ├── engine/
│   │   ├── engine.cc
│   │   ├── engine_impl.h
│   │   ├── naive_engine.cc
│   │   ├── openmp.cc
│   │   ├── openmp.h
│   │   ├── stream_manager.h
│   │   ├── thread_pool.h
│   │   ├── threaded_engine.cc
│   │   ├── threaded_engine.h
│   │   ├── threaded_engine_perdevice.cc
│   │   └── threaded_engine_pooled.cc
│   ├── imperative/
│   │   ├── attach_op_execs_pass.cc
│   │   ├── attach_op_resource_pass.cc
│   │   ├── cached_op.cc
│   │   ├── cached_op.h
│   │   ├── cached_op_threadsafe.cc
│   │   ├── cached_op_threadsafe.h
│   │   ├── cuda_graphs.h
│   │   ├── eliminate_common_expr_pass.cc
│   │   ├── exec_pass.h
│   │   ├── imperative.cc
│   │   ├── imperative_utils.cc
│   │   ├── imperative_utils.h
│   │   ├── infer_graph_attr_pass.cc
│   │   ├── inplace_addto_detect_pass.cc
│   │   ├── naive_cached_op.cc
│   │   ├── naive_cached_op.h
│   │   ├── pointwise_fusion_pass.cc
│   │   ├── simple_partition_pass.cc
│   │   └── simple_partition_pass.h
│   ├── initialize.cc
│   ├── initialize.h
│   ├── io/
│   │   ├── batchify.cc
│   │   ├── dataloader.cc
│   │   ├── dataset.cc
│   │   ├── image_aug_default.cc
│   │   ├── image_augmenter.h
│   │   ├── image_det_aug_default.cc
│   │   ├── image_io.cc
│   │   ├── image_iter_common.h
│   │   ├── image_recordio.h
│   │   ├── inst_vector.h
│   │   ├── io.cc
│   │   ├── iter_batchloader.h
│   │   ├── iter_csv.cc
│   │   ├── iter_image_det_recordio.cc
│   │   ├── iter_image_recordio.cc
│   │   ├── iter_image_recordio_2.cc
│   │   ├── iter_libsvm.cc
│   │   ├── iter_mnist.cc
│   │   ├── iter_normalize.h
│   │   ├── iter_prefetcher.h
│   │   ├── iter_sampler.cc
│   │   ├── iter_sparse.h
│   │   ├── iter_sparse_batchloader.h
│   │   ├── iter_sparse_prefetcher.h
│   │   └── opencv_compatibility.h
│   ├── ir/
│   │   └── expr.cc
│   ├── kvstore/
│   │   ├── comm.h
│   │   ├── comm_tree.h
│   │   ├── gpu_topology.h
│   │   ├── gradient_compression-inl.h
│   │   ├── gradient_compression.cc
│   │   ├── gradient_compression.cu
│   │   ├── gradient_compression.h
│   │   ├── kvstore.cc
│   │   ├── kvstore_dist.h
│   │   ├── kvstore_dist_server.h
│   │   ├── kvstore_local.h
│   │   ├── kvstore_nccl.h
│   │   ├── kvstore_utils.cc
│   │   ├── kvstore_utils.cu
│   │   ├── kvstore_utils.h
│   │   └── p3store_dist.h
│   ├── lang/
│   │   ├── expr.cc
│   │   └── ir.cc
│   ├── lib_api.cc
│   ├── libinfo.cc
│   ├── ndarray/
│   │   ├── ndarray.cc
│   │   ├── ndarray_function-inl.cuh
│   │   ├── ndarray_function-inl.h
│   │   ├── ndarray_function.cc
│   │   ├── ndarray_function.cu
│   │   └── ndarray_function.h
│   ├── nnvm/
│   │   ├── error.h
│   │   ├── gradient.cc
│   │   ├── graph_algorithm.h
│   │   ├── graph_editor.cc
│   │   ├── legacy_json_util.cc
│   │   ├── legacy_op_util.cc
│   │   ├── low_precision_pass.cc
│   │   ├── node_op_util.h
│   │   ├── plan_memory.cc
│   │   └── tvm_bridge.cc
│   ├── operator/
│   │   ├── all_finite-inl.h
│   │   ├── all_finite.cc
│   │   ├── all_finite.cu
│   │   ├── amp_graph_pass.cc
│   │   ├── bilinear_sampler-inl.h
│   │   ├── bilinear_sampler.cc
│   │   ├── bilinear_sampler.cu
│   │   ├── c_lapack_api.cc
│   │   ├── c_lapack_api.h
│   │   ├── channel_op_common.h
│   │   ├── contrib/
│   │   │   ├── adabelief-inl.h
│   │   │   ├── adabelief.cc
│   │   │   ├── adabelief.cu
│   │   │   ├── adamw-inl.h
│   │   │   ├── adamw.cc
│   │   │   ├── adamw.cu
│   │   │   ├── adaptive_avg_pooling-inl.h
│   │   │   ├── adaptive_avg_pooling.cc
│   │   │   ├── adaptive_avg_pooling.cu
│   │   │   ├── allclose_op-inl.h
│   │   │   ├── allclose_op.cc
│   │   │   ├── allclose_op.cu
│   │   │   ├── bilinear_resize-inl.cuh
│   │   │   ├── bilinear_resize-inl.h
│   │   │   ├── bilinear_resize.cc
│   │   │   ├── bilinear_resize.cu
│   │   │   ├── boolean_mask-inl.h
│   │   │   ├── boolean_mask.cc
│   │   │   ├── boolean_mask.cu
│   │   │   ├── bounding_box-common.h
│   │   │   ├── bounding_box-inl.cuh
│   │   │   ├── bounding_box-inl.h
│   │   │   ├── bounding_box.cc
│   │   │   ├── bounding_box.cu
│   │   │   ├── count_sketch-inl.h
│   │   │   ├── count_sketch.cc
│   │   │   ├── count_sketch.cu
│   │   │   ├── deformable_psroi_pooling-inl.h
│   │   │   ├── deformable_psroi_pooling.cc
│   │   │   ├── deformable_psroi_pooling.cu
│   │   │   ├── dgl_graph-inl.h
│   │   │   ├── dgl_graph.cc
│   │   │   ├── dgl_graph.cu
│   │   │   ├── dynamic_shape_ops-inl.h
│   │   │   ├── dynamic_shape_ops.cc
│   │   │   ├── erfinv-inl.h
│   │   │   ├── fft-inl.h
│   │   │   ├── fft.cc
│   │   │   ├── fft.cu
│   │   │   ├── gradient_multiplier_op.cc
│   │   │   ├── gradient_multiplier_op.cu
│   │   │   ├── hawkes_ll-inl.h
│   │   │   ├── hawkes_ll.cc
│   │   │   ├── hawkes_ll.cu
│   │   │   ├── index_array-inl.h
│   │   │   ├── index_array.cc
│   │   │   ├── index_array.cu
│   │   │   ├── index_copy-inl.h
│   │   │   ├── index_copy.cc
│   │   │   ├── index_copy.cu
│   │   │   ├── intgemm/
│   │   │   │   ├── intgemm_fully_connected_op.cc
│   │   │   │   ├── max_absolute_op.cc
│   │   │   │   ├── prepare_data_op.cc
│   │   │   │   ├── prepare_weight_op.cc
│   │   │   │   └── take_weight_op.cc
│   │   │   ├── krprod.cc
│   │   │   ├── krprod.h
│   │   │   ├── mrcnn_mask_target-inl.h
│   │   │   ├── mrcnn_mask_target.cu
│   │   │   ├── multi_lamb-inl.h
│   │   │   ├── multi_lamb.cc
│   │   │   ├── multi_lamb.cu
│   │   │   ├── multi_lans-inl.h
│   │   │   ├── multi_lans.cc
│   │   │   ├── multi_lans.cu
│   │   │   ├── multi_lars-inl.h
│   │   │   ├── multi_lars.cc
│   │   │   ├── multi_lars.cu
│   │   │   ├── multi_proposal-inl.h
│   │   │   ├── multi_proposal.cc
│   │   │   ├── multi_proposal.cu
│   │   │   ├── multi_sum_sq-inl.h
│   │   │   ├── multi_sum_sq.cc
│   │   │   ├── multi_sum_sq.cu
│   │   │   ├── multibox_detection-inl.h
│   │   │   ├── multibox_detection.cc
│   │   │   ├── multibox_detection.cu
│   │   │   ├── multibox_prior-inl.h
│   │   │   ├── multibox_prior.cc
│   │   │   ├── multibox_prior.cu
│   │   │   ├── multibox_target-inl.h
│   │   │   ├── multibox_target.cc
│   │   │   ├── multibox_target.cu
│   │   │   ├── nn/
│   │   │   │   ├── deformable_im2col.cuh
│   │   │   │   ├── deformable_im2col.h
│   │   │   │   ├── modulated_deformable_im2col.cuh
│   │   │   │   └── modulated_deformable_im2col.h
│   │   │   ├── nnz.cc
│   │   │   ├── optimizer_op-inl.h
│   │   │   ├── optimizer_op.cc
│   │   │   ├── optimizer_op.cu
│   │   │   ├── preloaded_multi_sgd-inl.h
│   │   │   ├── preloaded_multi_sgd.cc
│   │   │   ├── preloaded_multi_sgd.cu
│   │   │   ├── proposal-inl.h
│   │   │   ├── proposal.cc
│   │   │   ├── proposal.cu
│   │   │   ├── psroi_pooling-inl.h
│   │   │   ├── psroi_pooling.cc
│   │   │   ├── psroi_pooling.cu
│   │   │   ├── quadratic_op-inl.h
│   │   │   ├── quadratic_op.cc
│   │   │   ├── quadratic_op.cu
│   │   │   ├── reset_arrays-inl.h
│   │   │   ├── reset_arrays.cc
│   │   │   ├── reset_arrays.cu
│   │   │   ├── roi_align-inl.h
│   │   │   ├── roi_align.cc
│   │   │   ├── roi_align.cu
│   │   │   ├── rroi_align-inl.h
│   │   │   ├── rroi_align.cc
│   │   │   ├── stes_op.cc
│   │   │   ├── stes_op.cu
│   │   │   ├── stes_op.h
│   │   │   ├── sync_batch_norm-inl.h
│   │   │   ├── sync_batch_norm.cc
│   │   │   ├── sync_batch_norm.cu
│   │   │   ├── transformer-inl.h
│   │   │   ├── transformer.cc
│   │   │   ├── transformer.cu
│   │   │   └── tvmop/
│   │   │       ├── dot.cc
│   │   │       └── ufunc.cc
│   │   ├── control_flow.cc
│   │   ├── correlation-inl.h
│   │   ├── correlation.cc
│   │   ├── correlation.cu
│   │   ├── crop-inl.h
│   │   ├── crop.cc
│   │   ├── crop.cu
│   │   ├── cross_device_copy.cc
│   │   ├── cudnn_bilinear_sampler-inl.h
│   │   ├── cudnn_lrn-inl.h
│   │   ├── cudnn_ops.cc
│   │   ├── cudnn_ops.h
│   │   ├── cudnn_spatial_transformer-inl.h
│   │   ├── custom/
│   │   │   ├── custom-inl.h
│   │   │   ├── custom.cc
│   │   │   ├── native_op-inl.h
│   │   │   ├── native_op.cc
│   │   │   ├── native_op.cu
│   │   │   ├── ndarray_op-inl.h
│   │   │   └── ndarray_op.cc
│   │   ├── deformable_convolution-inl.h
│   │   ├── deformable_convolution.cc
│   │   ├── deformable_convolution.cu
│   │   ├── elemwise_op_common.h
│   │   ├── fusion/
│   │   │   ├── fused_op-inl.h
│   │   │   ├── fused_op.cc
│   │   │   ├── fused_op.cu
│   │   │   └── fused_op.h
│   │   ├── grid_generator-inl.h
│   │   ├── grid_generator.cc
│   │   ├── grid_generator.cu
│   │   ├── identity_attach_KL_sparse_reg-inl.h
│   │   ├── identity_attach_KL_sparse_reg.cc
│   │   ├── identity_attach_KL_sparse_reg.cu
│   │   ├── image/
│   │   │   ├── crop-inl.h
│   │   │   ├── crop.cc
│   │   │   ├── crop.cu
│   │   │   ├── image_random-inl.h
│   │   │   ├── image_random.cc
│   │   │   ├── image_random.cu
│   │   │   ├── image_utils.h
│   │   │   ├── resize-inl.h
│   │   │   ├── resize.cc
│   │   │   └── resize.cu
│   │   ├── instance_norm-inl.h
│   │   ├── instance_norm.cc
│   │   ├── instance_norm.cu
│   │   ├── l2_normalization-inl.h
│   │   ├── l2_normalization.cc
│   │   ├── l2_normalization.cu
│   │   ├── leaky_relu-inl.h
│   │   ├── leaky_relu.cc
│   │   ├── leaky_relu.cu
│   │   ├── linalg.h
│   │   ├── linalg_impl.h
│   │   ├── loss_binary_op-inl.h
│   │   ├── loss_binary_op.cc
│   │   ├── loss_binary_op.cu
│   │   ├── make_loss-inl.h
│   │   ├── make_loss.cc
│   │   ├── make_loss.cu
│   │   ├── math_functions-inl.h
│   │   ├── mkl_functions-inl.h
│   │   ├── modulated_deformable_convolution-inl.h
│   │   ├── modulated_deformable_convolution.cc
│   │   ├── modulated_deformable_convolution.cu
│   │   ├── mshadow_op.h
│   │   ├── mxnet_op.h
│   │   ├── nn/
│   │   │   ├── activation-inl.h
│   │   │   ├── activation.cc
│   │   │   ├── activation.cu
│   │   │   ├── batch_norm-inl.h
│   │   │   ├── batch_norm.cc
│   │   │   ├── batch_norm.cu
│   │   │   ├── concat-inl.h
│   │   │   ├── concat.cc
│   │   │   ├── concat.cu
│   │   │   ├── convolution-inl.h
│   │   │   ├── convolution.cc
│   │   │   ├── convolution.cu
│   │   │   ├── ctc_loss-inl.h
│   │   │   ├── ctc_loss.cc
│   │   │   ├── ctc_loss.cu
│   │   │   ├── cudnn/
│   │   │   │   ├── cudnn_activation-inl.h
│   │   │   │   ├── cudnn_algoreg-inl.h
│   │   │   │   ├── cudnn_algoreg.cc
│   │   │   │   ├── cudnn_batch_norm.cu
│   │   │   │   ├── cudnn_batch_norm.h
│   │   │   │   ├── cudnn_convolution-inl.h
│   │   │   │   ├── cudnn_deconvolution-inl.h
│   │   │   │   ├── cudnn_pooling-inl.h
│   │   │   │   └── cudnn_softmax_activation-inl.h
│   │   │   ├── deconvolution-inl.h
│   │   │   ├── deconvolution.cc
│   │   │   ├── deconvolution.cu
│   │   │   ├── depthwise_convolution-inl.h
│   │   │   ├── depthwise_convolution_tf.cuh
│   │   │   ├── dnnl/
│   │   │   │   ├── dnnl_act-inl.h
│   │   │   │   ├── dnnl_act.cc
│   │   │   │   ├── dnnl_base-inl.h
│   │   │   │   ├── dnnl_base.cc
│   │   │   │   ├── dnnl_batch_dot-inl.h
│   │   │   │   ├── dnnl_batch_dot.cc
│   │   │   │   ├── dnnl_batch_norm-inl.h
│   │   │   │   ├── dnnl_batch_norm.cc
│   │   │   │   ├── dnnl_binary-inl.h
│   │   │   │   ├── dnnl_binary.cc
│   │   │   │   ├── dnnl_concat-inl.h
│   │   │   │   ├── dnnl_concat.cc
│   │   │   │   ├── dnnl_convolution-inl.h
│   │   │   │   ├── dnnl_convolution.cc
│   │   │   │   ├── dnnl_copy-inl.h
│   │   │   │   ├── dnnl_copy.cc
│   │   │   │   ├── dnnl_deconvolution-inl.h
│   │   │   │   ├── dnnl_deconvolution.cc
│   │   │   │   ├── dnnl_dot-inl.h
│   │   │   │   ├── dnnl_dot.cc
│   │   │   │   ├── dnnl_eltwise-inl.h
│   │   │   │   ├── dnnl_eltwise.cc
│   │   │   │   ├── dnnl_fully_connected-inl.h
│   │   │   │   ├── dnnl_fully_connected.cc
│   │   │   │   ├── dnnl_layer_norm-inl.h
│   │   │   │   ├── dnnl_layer_norm.cc
│   │   │   │   ├── dnnl_log_softmax.cc
│   │   │   │   ├── dnnl_lrn-inl.h
│   │   │   │   ├── dnnl_masked_softmax-inl.h
│   │   │   │   ├── dnnl_masked_softmax.cc
│   │   │   │   ├── dnnl_pooling-inl.h
│   │   │   │   ├── dnnl_pooling.cc
│   │   │   │   ├── dnnl_pow_mul_scalar-inl.h
│   │   │   │   ├── dnnl_pow_mul_scalar.cc
│   │   │   │   ├── dnnl_reduce-inl.h
│   │   │   │   ├── dnnl_reduce.cc
│   │   │   │   ├── dnnl_reshape-inl.h
│   │   │   │   ├── dnnl_reshape.cc
│   │   │   │   ├── dnnl_rnn-inl.h
│   │   │   │   ├── dnnl_rnn.cc
│   │   │   │   ├── dnnl_softmax-inl.h
│   │   │   │   ├── dnnl_softmax.cc
│   │   │   │   ├── dnnl_softmax_output-inl.h
│   │   │   │   ├── dnnl_softmax_output.cc
│   │   │   │   ├── dnnl_split-inl.h
│   │   │   │   ├── dnnl_split.cc
│   │   │   │   ├── dnnl_stack-inl.h
│   │   │   │   ├── dnnl_stack.cc
│   │   │   │   ├── dnnl_sum-inl.h
│   │   │   │   ├── dnnl_sum.cc
│   │   │   │   ├── dnnl_transpose-inl.h
│   │   │   │   ├── dnnl_transpose.cc
│   │   │   │   ├── dnnl_where-inl.h
│   │   │   │   └── dnnl_where.cc
│   │   │   ├── dropout-inl.h
│   │   │   ├── dropout.cc
│   │   │   ├── dropout.cu
│   │   │   ├── fully_connected-inl.h
│   │   │   ├── fully_connected.cc
│   │   │   ├── fully_connected.cu
│   │   │   ├── group_norm-inl.h
│   │   │   ├── group_norm.cc
│   │   │   ├── group_norm.cu
│   │   │   ├── im2col-inl.h
│   │   │   ├── im2col.cc
│   │   │   ├── im2col.cu
│   │   │   ├── im2col.cuh
│   │   │   ├── im2col.h
│   │   │   ├── layer_norm-inl.h
│   │   │   ├── layer_norm.cc
│   │   │   ├── layer_norm.cu
│   │   │   ├── layer_norm_cpu.h
│   │   │   ├── log_softmax.cc
│   │   │   ├── log_softmax.cu
│   │   │   ├── lrn-inl.h
│   │   │   ├── lrn.cc
│   │   │   ├── lrn.cu
│   │   │   ├── masked_softmax.cc
│   │   │   ├── moments-inl.h
│   │   │   ├── moments.cc
│   │   │   ├── moments.cu
│   │   │   ├── pool.cuh
│   │   │   ├── pool.h
│   │   │   ├── pool_utils.h
│   │   │   ├── pooling-inl.h
│   │   │   ├── pooling.cc
│   │   │   ├── pooling.cu
│   │   │   ├── sequence_mask-inl.h
│   │   │   ├── softmax-inl.h
│   │   │   ├── softmax.cc
│   │   │   ├── softmax.cu
│   │   │   ├── softmax_activation-inl.h
│   │   │   ├── softmax_activation.cc
│   │   │   ├── softmax_activation.cu
│   │   │   ├── softmin.cc
│   │   │   ├── softmin.cu
│   │   │   ├── upsampling-inl.h
│   │   │   ├── upsampling.cc
│   │   │   └── upsampling.cu
│   │   ├── npx_control_flow.cc
│   │   ├── npx_control_flow.h
│   │   ├── numpy/
│   │   │   ├── linalg/
│   │   │   │   ├── broadcast_reduce_customized-inl.h
│   │   │   │   ├── broadcast_reduce_op_customized.h
│   │   │   │   ├── np_eig-inl.h
│   │   │   │   ├── np_eig.cc
│   │   │   │   ├── np_eig.cu
│   │   │   │   ├── np_eigvals-inl.h
│   │   │   │   ├── np_eigvals.cc
│   │   │   │   ├── np_eigvals.cu
│   │   │   │   ├── np_gesvd-inl.h
│   │   │   │   ├── np_gesvd.cc
│   │   │   │   ├── np_gesvd.cu
│   │   │   │   ├── np_lstsq-inl.h
│   │   │   │   ├── np_lstsq.cc
│   │   │   │   ├── np_lstsq.cu
│   │   │   │   ├── np_matrix_rank-inl.h
│   │   │   │   ├── np_matrix_rank.cc
│   │   │   │   ├── np_matrix_rank.cu
│   │   │   │   ├── np_norm-inl.h
│   │   │   │   ├── np_norm.cc
│   │   │   │   ├── np_norm_backward.cc
│   │   │   │   ├── np_norm_backward.cu
│   │   │   │   ├── np_norm_forward.cc
│   │   │   │   ├── np_norm_forward.cu
│   │   │   │   ├── np_pinv-inl.h
│   │   │   │   ├── np_pinv.cc
│   │   │   │   ├── np_pinv.cu
│   │   │   │   ├── np_potrf-inl.h
│   │   │   │   ├── np_potrf.cc
│   │   │   │   ├── np_potrf.cu
│   │   │   │   ├── np_qr-inl.h
│   │   │   │   ├── np_qr.cc
│   │   │   │   ├── np_qr.cu
│   │   │   │   ├── np_solve-inl.h
│   │   │   │   ├── np_solve.cc
│   │   │   │   ├── np_solve.cu
│   │   │   │   ├── np_tensorinv-inl.h
│   │   │   │   ├── np_tensorinv.cc
│   │   │   │   ├── np_tensorinv.cu
│   │   │   │   ├── np_tensorsolve-inl.h
│   │   │   │   ├── np_tensorsolve.cc
│   │   │   │   └── np_tensorsolve.cu
│   │   │   ├── np_bincount_op-inl.h
│   │   │   ├── np_bincount_op.cc
│   │   │   ├── np_bincount_op.cu
│   │   │   ├── np_boolean_mask_assign.cc
│   │   │   ├── np_boolean_mask_assign.cu
│   │   │   ├── np_broadcast_reduce_op.cc
│   │   │   ├── np_broadcast_reduce_op.h
│   │   │   ├── np_broadcast_reduce_op_boolean.cc
│   │   │   ├── np_broadcast_reduce_op_boolean.cu
│   │   │   ├── np_broadcast_reduce_op_index.cc
│   │   │   ├── np_broadcast_reduce_op_index.cu
│   │   │   ├── np_broadcast_reduce_op_value.h
│   │   │   ├── np_broadcast_reduce_op_value_broadcast_to.cc
│   │   │   ├── np_broadcast_reduce_op_value_broadcast_to.cu
│   │   │   ├── np_broadcast_reduce_op_value_max.cc
│   │   │   ├── np_broadcast_reduce_op_value_max.cu
│   │   │   ├── np_broadcast_reduce_op_value_mean.cc
│   │   │   ├── np_broadcast_reduce_op_value_mean.cu
│   │   │   ├── np_broadcast_reduce_op_value_min.cc
│   │   │   ├── np_broadcast_reduce_op_value_min.cu
│   │   │   ├── np_broadcast_reduce_op_value_prod.cc
│   │   │   ├── np_broadcast_reduce_op_value_prod.cu
│   │   │   ├── np_broadcast_reduce_op_value_sum.cc
│   │   │   ├── np_broadcast_reduce_op_value_sum.cu
│   │   │   ├── np_constraint_check.cc
│   │   │   ├── np_constraint_check.cu
│   │   │   ├── np_constraint_check.h
│   │   │   ├── np_cross-inl.h
│   │   │   ├── np_cross.cc
│   │   │   ├── np_cross.cu
│   │   │   ├── np_cumsum-inl.h
│   │   │   ├── np_cumsum.cc
│   │   │   ├── np_cumsum.cu
│   │   │   ├── np_delete_op-inl.h
│   │   │   ├── np_delete_op.cc
│   │   │   ├── np_delete_op.cu
│   │   │   ├── np_diff-inl.h
│   │   │   ├── np_diff.cc
│   │   │   ├── np_diff.cu
│   │   │   ├── np_dot-inl.h
│   │   │   ├── np_dot_backward.cc
│   │   │   ├── np_dot_backward.cu
│   │   │   ├── np_dot_forward.cc
│   │   │   ├── np_dot_forward.cu
│   │   │   ├── np_ediff1d_op-inl.h
│   │   │   ├── np_ediff1d_op.cc
│   │   │   ├── np_ediff1d_op.cu
│   │   │   ├── np_einsum_op-inl.h
│   │   │   ├── np_einsum_op.cc
│   │   │   ├── np_einsum_op.cu
│   │   │   ├── np_einsum_path_op-inl.h
│   │   │   ├── np_elemwise_broadcast_logic_op.h
│   │   │   ├── np_elemwise_broadcast_logic_op_and.cc
│   │   │   ├── np_elemwise_broadcast_logic_op_and.cu
│   │   │   ├── np_elemwise_broadcast_logic_op_equal.cc
│   │   │   ├── np_elemwise_broadcast_logic_op_equal.cu
│   │   │   ├── np_elemwise_broadcast_logic_op_greater.cc
│   │   │   ├── np_elemwise_broadcast_logic_op_greater.cu
│   │   │   ├── np_elemwise_broadcast_logic_op_greater_equal.cc
│   │   │   ├── np_elemwise_broadcast_logic_op_greater_equal.cu
│   │   │   ├── np_elemwise_broadcast_logic_op_less.cc
│   │   │   ├── np_elemwise_broadcast_logic_op_less.cu
│   │   │   ├── np_elemwise_broadcast_logic_op_less_equal.cc
│   │   │   ├── np_elemwise_broadcast_logic_op_less_equal.cu
│   │   │   ├── np_elemwise_broadcast_logic_op_not_equal.cc
│   │   │   ├── np_elemwise_broadcast_logic_op_not_equal.cu
│   │   │   ├── np_elemwise_broadcast_logic_op_or.cc
│   │   │   ├── np_elemwise_broadcast_logic_op_or.cu
│   │   │   ├── np_elemwise_broadcast_logic_op_xor.cc
│   │   │   ├── np_elemwise_broadcast_logic_op_xor.cu
│   │   │   ├── np_elemwise_broadcast_op.h
│   │   │   ├── np_elemwise_broadcast_op_add.cc
│   │   │   ├── np_elemwise_broadcast_op_add.cu
│   │   │   ├── np_elemwise_broadcast_op_extended.cc
│   │   │   ├── np_elemwise_broadcast_op_extended.cu
│   │   │   ├── np_elemwise_broadcast_op_extended_sec.cc
│   │   │   ├── np_elemwise_broadcast_op_extended_sec.cu
│   │   │   ├── np_elemwise_broadcast_op_extended_thi.cc
│   │   │   ├── np_elemwise_broadcast_op_extended_thi.cu
│   │   │   ├── np_elemwise_broadcast_op_lae.cc
│   │   │   ├── np_elemwise_broadcast_op_lae.cu
│   │   │   ├── np_elemwise_broadcast_op_mod.cc
│   │   │   ├── np_elemwise_broadcast_op_mod.cu
│   │   │   ├── np_elemwise_broadcast_op_mul.cc
│   │   │   ├── np_elemwise_broadcast_op_mul.cu
│   │   │   ├── np_elemwise_broadcast_op_pow.cc
│   │   │   ├── np_elemwise_broadcast_op_pow.cu
│   │   │   ├── np_elemwise_broadcast_op_scalar.cc
│   │   │   ├── np_elemwise_broadcast_op_scalar.cu
│   │   │   ├── np_elemwise_broadcast_op_sub.cc
│   │   │   ├── np_elemwise_broadcast_op_sub.cu
│   │   │   ├── np_elemwise_unary_op_basic.cc
│   │   │   ├── np_elemwise_unary_op_basic.cu
│   │   │   ├── np_fill_diagonal_op-inl.h
│   │   │   ├── np_fill_diagonal_op.cc
│   │   │   ├── np_fill_diagonal_op.cu
│   │   │   ├── np_floor_divide.cc
│   │   │   ├── np_floor_divide.cu
│   │   │   ├── np_indexing_op.cc
│   │   │   ├── np_indexing_op.cu
│   │   │   ├── np_indexing_op.h
│   │   │   ├── np_init_op.cc
│   │   │   ├── np_init_op.cu
│   │   │   ├── np_init_op.h
│   │   │   ├── np_insert_op-inl.h
│   │   │   ├── np_insert_op_scalar-inl.h
│   │   │   ├── np_insert_op_scalar.cc
│   │   │   ├── np_insert_op_scalar.cu
│   │   │   ├── np_insert_op_slice-inl.h
│   │   │   ├── np_insert_op_slice.cc
│   │   │   ├── np_insert_op_slice.cu
│   │   │   ├── np_insert_op_tensor-inl.h
│   │   │   ├── np_insert_op_tensor.cc
│   │   │   ├── np_insert_op_tensor.cu
│   │   │   ├── np_interp_op-inl.h
│   │   │   ├── np_interp_op.cc
│   │   │   ├── np_interp_op.cu
│   │   │   ├── np_kron-inl.h
│   │   │   ├── np_kron_backward.cc
│   │   │   ├── np_kron_backward.cu
│   │   │   ├── np_kron_forward.cc
│   │   │   ├── np_kron_forward.cu
│   │   │   ├── np_matmul_op-inl.h
│   │   │   ├── np_matmul_op.cc
│   │   │   ├── np_matmul_op.cu
│   │   │   ├── np_matrix_op-inl.h
│   │   │   ├── np_matrix_op.cc
│   │   │   ├── np_matrix_op.cu
│   │   │   ├── np_memory_op.cc
│   │   │   ├── np_memory_op.cu
│   │   │   ├── np_memory_op.h
│   │   │   ├── np_moments_op.cc
│   │   │   ├── np_moments_op.cu
│   │   │   ├── np_nonzero_op-inl.h
│   │   │   ├── np_nonzero_op.cc
│   │   │   ├── np_nonzero_op.cu
│   │   │   ├── np_pad_op-inl.h
│   │   │   ├── np_pad_op.cc
│   │   │   ├── np_pad_op.cu
│   │   │   ├── np_percentile_op-inl.h
│   │   │   ├── np_percentile_op.cc
│   │   │   ├── np_percentile_op.cu
│   │   │   ├── np_polynomial_op-inl.h
│   │   │   ├── np_polynomial_op.cc
│   │   │   ├── np_polynomial_op.cu
│   │   │   ├── np_repeat_op-inl.h
│   │   │   ├── np_repeat_op.cc
│   │   │   ├── np_repeat_op.cu
│   │   │   ├── np_tensordot_op-inl.h
│   │   │   ├── np_tensordot_op.cc
│   │   │   ├── np_tensordot_op.cu
│   │   │   ├── np_trace_op-inl.h
│   │   │   ├── np_trace_op.cc
│   │   │   ├── np_trace_op.cu
│   │   │   ├── np_tri_op-inl.h
│   │   │   ├── np_tri_op.cc
│   │   │   ├── np_tri_op.cu
│   │   │   ├── np_tril_op-inl.h
│   │   │   ├── np_tril_op.cc
│   │   │   ├── np_tril_op.cu
│   │   │   ├── np_triu_op-inl.h
│   │   │   ├── np_triu_op.cc
│   │   │   ├── np_triu_op.cu
│   │   │   ├── np_true_divide-inl.h
│   │   │   ├── np_true_divide.cc
│   │   │   ├── np_true_divide.cu
│   │   │   ├── np_unique_op.cc
│   │   │   ├── np_unique_op.cu
│   │   │   ├── np_unique_op.h
│   │   │   ├── np_where_backward_op.cc
│   │   │   ├── np_where_backward_op.cu
│   │   │   ├── np_where_forward_op.cc
│   │   │   ├── np_where_forward_op.cu
│   │   │   ├── np_where_op-inl.h
│   │   │   ├── np_window_op.cc
│   │   │   ├── np_window_op.cu
│   │   │   ├── np_window_op.h
│   │   │   └── random/
│   │   │       ├── dist_common.cc
│   │   │       ├── dist_common.cu
│   │   │       ├── dist_common.h
│   │   │       ├── np_bernoulli_op.cc
│   │   │       ├── np_bernoulli_op.cu
│   │   │       ├── np_bernoulli_op.h
│   │   │       ├── np_choice_op.cc
│   │   │       ├── np_choice_op.cu
│   │   │       ├── np_choice_op.h
│   │   │       ├── np_exponential_op.cc
│   │   │       ├── np_exponential_op.cu
│   │   │       ├── np_exponential_op.h
│   │   │       ├── np_gamma_op.cc
│   │   │       ├── np_gamma_op.cu
│   │   │       ├── np_gamma_op.h
│   │   │       ├── np_laplace_op.cc
│   │   │       ├── np_laplace_op.cu
│   │   │       ├── np_laplace_op.h
│   │   │       ├── np_location_scale_op.cc
│   │   │       ├── np_location_scale_op.cu
│   │   │       ├── np_location_scale_op.h
│   │   │       ├── np_multinomial_op.cc
│   │   │       ├── np_multinomial_op.cu
│   │   │       ├── np_multinomial_op.h
│   │   │       ├── np_normal_op.cc
│   │   │       ├── np_normal_op.cu
│   │   │       ├── np_normal_op.h
│   │   │       ├── np_pareto_op.cc
│   │   │       ├── np_pareto_op.cu
│   │   │       ├── np_pareto_op.h
│   │   │       ├── np_power_op.cc
│   │   │       ├── np_power_op.cu
│   │   │       ├── np_power_op.h
│   │   │       ├── np_rayleigh_op.cc
│   │   │       ├── np_rayleigh_op.cu
│   │   │       ├── np_rayleigh_op.h
│   │   │       ├── np_uniform_op.cc
│   │   │       ├── np_uniform_op.cu
│   │   │       ├── np_uniform_op.h
│   │   │       ├── np_weibull_op.cc
│   │   │       ├── np_weibull_op.cu
│   │   │       └── np_weibull_op.h
│   │   ├── operator.cc
│   │   ├── operator_common.h
│   │   ├── operator_tune-inl.h
│   │   ├── operator_tune.cc
│   │   ├── operator_tune.h
│   │   ├── operator_util.cc
│   │   ├── optimizer_op-inl.h
│   │   ├── optimizer_op.cc
│   │   ├── optimizer_op.cu
│   │   ├── pad-inl.h
│   │   ├── pad.cc
│   │   ├── pad.cu
│   │   ├── quantization/
│   │   │   ├── calibrate-inl.h
│   │   │   ├── calibrate.cc
│   │   │   ├── dequantize-inl.h
│   │   │   ├── dequantize.cc
│   │   │   ├── dequantize.cu
│   │   │   ├── dnnl/
│   │   │   │   ├── dnnl_dequantize-inl.h
│   │   │   │   ├── dnnl_quantize-inl.h
│   │   │   │   ├── dnnl_quantize_asym-inl.h
│   │   │   │   ├── dnnl_quantize_v2-inl.h
│   │   │   │   ├── dnnl_quantized_act.cc
│   │   │   │   ├── dnnl_quantized_batch_norm.cc
│   │   │   │   ├── dnnl_quantized_concat.cc
│   │   │   │   ├── dnnl_quantized_conv.cc
│   │   │   │   ├── dnnl_quantized_elemwise_add.cc
│   │   │   │   ├── dnnl_quantized_flatten.cc
│   │   │   │   ├── dnnl_quantized_fully_connected.cc
│   │   │   │   ├── dnnl_quantized_ops-inl.h
│   │   │   │   ├── dnnl_quantized_pooling.cc
│   │   │   │   ├── dnnl_quantized_reshape.cc
│   │   │   │   ├── dnnl_quantized_rnn-inl.h
│   │   │   │   ├── dnnl_quantized_rnn.cc
│   │   │   │   ├── dnnl_quantized_transpose.cc
│   │   │   │   └── dnnl_requantize-inl.h
│   │   │   ├── quantization_utils.h
│   │   │   ├── quantize-inl.h
│   │   │   ├── quantize.cc
│   │   │   ├── quantize.cu
│   │   │   ├── quantize_asym-inl.h
│   │   │   ├── quantize_asym.cc
│   │   │   ├── quantize_graph_pass.cc
│   │   │   ├── quantize_v2-inl.h
│   │   │   ├── quantize_v2.cc
│   │   │   ├── quantize_v2.cu
│   │   │   ├── quantized_activation.cc
│   │   │   ├── quantized_batch_norm.cc
│   │   │   ├── quantized_batch_norm_relu.cc
│   │   │   ├── quantized_concat.cc
│   │   │   ├── quantized_conv.cc
│   │   │   ├── quantized_conv.cu
│   │   │   ├── quantized_elemwise_add-inl.h
│   │   │   ├── quantized_elemwise_add.cc
│   │   │   ├── quantized_elemwise_mul-inl.h
│   │   │   ├── quantized_elemwise_mul.cc
│   │   │   ├── quantized_flatten-inl.h
│   │   │   ├── quantized_flatten.cc
│   │   │   ├── quantized_flatten.cu
│   │   │   ├── quantized_fully_connected.cc
│   │   │   ├── quantized_fully_connected.cu
│   │   │   ├── quantized_indexing_op.cc
│   │   │   ├── quantized_pooling.cc
│   │   │   ├── quantized_pooling.cu
│   │   │   ├── quantized_reshape-inl.h
│   │   │   ├── quantized_reshape.cc
│   │   │   ├── quantized_rnn-inl.h
│   │   │   ├── quantized_rnn.cc
│   │   │   ├── quantized_transpose.cc
│   │   │   ├── requantize-inl.h
│   │   │   ├── requantize.cc
│   │   │   └── requantize.cu
│   │   ├── random/
│   │   │   ├── multisample_op.cc
│   │   │   ├── multisample_op.cu
│   │   │   ├── multisample_op.h
│   │   │   ├── pdf_op.cc
│   │   │   ├── pdf_op.cu
│   │   │   ├── pdf_op.h
│   │   │   ├── sample_multinomial_op.cc
│   │   │   ├── sample_multinomial_op.cu
│   │   │   ├── sample_multinomial_op.h
│   │   │   ├── sample_op.cc
│   │   │   ├── sample_op.cu
│   │   │   ├── sample_op.h
│   │   │   ├── sampler.h
│   │   │   ├── shuffle_op.cc
│   │   │   ├── shuffle_op.cu
│   │   │   ├── unique_sample_op.cc
│   │   │   └── unique_sample_op.h
│   │   ├── regression_output-inl.h
│   │   ├── regression_output.cc
│   │   ├── regression_output.cu
│   │   ├── rnn-inl.h
│   │   ├── rnn.cc
│   │   ├── rnn.cu
│   │   ├── rnn_impl.h
│   │   ├── roi_pooling-inl.h
│   │   ├── roi_pooling.cc
│   │   ├── roi_pooling.cu
│   │   ├── sequence_last-inl.h
│   │   ├── sequence_last.cc
│   │   ├── sequence_last.cu
│   │   ├── sequence_mask-inl.h
│   │   ├── sequence_mask.cc
│   │   ├── sequence_mask.cu
│   │   ├── sequence_op_common.h
│   │   ├── sequence_reverse-inl.h
│   │   ├── sequence_reverse.cc
│   │   ├── sequence_reverse.cu
│   │   ├── slice_channel-inl.h
│   │   ├── slice_channel.cc
│   │   ├── slice_channel.cu
│   │   ├── softmax_output-inl.h
│   │   ├── softmax_output.cc
│   │   ├── softmax_output.cu
│   │   ├── spatial_transformer-inl.h
│   │   ├── spatial_transformer.cc
│   │   ├── spatial_transformer.cu
│   │   ├── special_functions-inl.h
│   │   ├── subgraph/
│   │   │   ├── build_subgraph.cc
│   │   │   ├── common.h
│   │   │   ├── default_subgraph_property.cc
│   │   │   ├── default_subgraph_property_v2.cc
│   │   │   ├── dnnl/
│   │   │   │   ├── dnnl_batch_dot.cc
│   │   │   │   ├── dnnl_batch_dot_property.h
│   │   │   │   ├── dnnl_bn_relu.cc
│   │   │   │   ├── dnnl_bn_relu_property.h
│   │   │   │   ├── dnnl_common.h
│   │   │   │   ├── dnnl_conv-inl.h
│   │   │   │   ├── dnnl_conv.cc
│   │   │   │   ├── dnnl_conv_property.h
│   │   │   │   ├── dnnl_fc-inl.h
│   │   │   │   ├── dnnl_fc.cc
│   │   │   │   ├── dnnl_fc_property.h
│   │   │   │   ├── dnnl_fc_sum_fuse_property.h
│   │   │   │   ├── dnnl_identity_property.h
│   │   │   │   ├── dnnl_post_amp_property.h
│   │   │   │   ├── dnnl_post_quantize_align_scale_property.h
│   │   │   │   ├── dnnl_post_quantize_property.h
│   │   │   │   ├── dnnl_pow_mul_scalar.cc
│   │   │   │   ├── dnnl_pow_mul_scalar_property.h
│   │   │   │   ├── dnnl_remove_casts_property.h
│   │   │   │   ├── dnnl_subgraph_base-inl.h
│   │   │   │   ├── dnnl_subgraph_property.cc
│   │   │   │   ├── dnnl_transformer-inl.h
│   │   │   │   ├── dnnl_transformer.cc
│   │   │   │   ├── dnnl_transformer_qk_common.h
│   │   │   │   ├── dnnl_transformer_qk_property.h
│   │   │   │   └── dnnl_transformer_valatt_property.h
│   │   │   ├── eliminate_common_nodes_pass.cc
│   │   │   ├── partitioner/
│   │   │   │   └── custom_subgraph_property.h
│   │   │   ├── static_shape_subgraph_property.cc
│   │   │   ├── subgraph_property.h
│   │   │   └── tensorrt/
│   │   │       ├── nnvm_to_onnx-inl.h
│   │   │       ├── nnvm_to_onnx.cc
│   │   │       ├── onnx_to_tensorrt.cc
│   │   │       ├── onnx_to_tensorrt.h
│   │   │       ├── tensorrt-inl.h
│   │   │       ├── tensorrt.cc
│   │   │       └── tensorrt.cu
│   │   ├── subgraph_op_common.cc
│   │   ├── subgraph_op_common.h
│   │   ├── svm_output-inl.h
│   │   ├── svm_output.cc
│   │   ├── svm_output.cu
│   │   ├── swapaxis-inl.h
│   │   ├── swapaxis.cc
│   │   ├── swapaxis.cu
│   │   ├── tensor/
│   │   │   ├── amp_cast.cc
│   │   │   ├── amp_cast.cu
│   │   │   ├── amp_cast.h
│   │   │   ├── broadcast_reduce-inl.h
│   │   │   ├── broadcast_reduce_minmax_value.cc
│   │   │   ├── broadcast_reduce_minmax_value.cu
│   │   │   ├── broadcast_reduce_norm_value.cc
│   │   │   ├── broadcast_reduce_norm_value.cu
│   │   │   ├── broadcast_reduce_op.cc
│   │   │   ├── broadcast_reduce_op.h
│   │   │   ├── broadcast_reduce_op_index.cc
│   │   │   ├── broadcast_reduce_op_index.cu
│   │   │   ├── broadcast_reduce_op_value.cc
│   │   │   ├── broadcast_reduce_op_value.cu
│   │   │   ├── broadcast_reduce_prod_value.cc
│   │   │   ├── broadcast_reduce_prod_value.cu
│   │   │   ├── broadcast_reduce_sum_value.cc
│   │   │   ├── broadcast_reduce_sum_value.cu
│   │   │   ├── cast_storage-inl.cuh
│   │   │   ├── cast_storage-inl.h
│   │   │   ├── cast_storage.cc
│   │   │   ├── cast_storage.cu
│   │   │   ├── control_flow_op.cc
│   │   │   ├── control_flow_op.cu
│   │   │   ├── control_flow_op.h
│   │   │   ├── diag_op-inl.h
│   │   │   ├── diag_op.cc
│   │   │   ├── diag_op.cu
│   │   │   ├── dot-inl.cuh
│   │   │   ├── dot-inl.h
│   │   │   ├── dot.cc
│   │   │   ├── dot.cu
│   │   │   ├── elemwise_binary_broadcast_op.cc
│   │   │   ├── elemwise_binary_broadcast_op.h
│   │   │   ├── elemwise_binary_broadcast_op_basic.cc
│   │   │   ├── elemwise_binary_broadcast_op_basic.cu
│   │   │   ├── elemwise_binary_broadcast_op_extended.cc
│   │   │   ├── elemwise_binary_broadcast_op_extended.cu
│   │   │   ├── elemwise_binary_broadcast_op_logic.cc
│   │   │   ├── elemwise_binary_broadcast_op_logic.cu
│   │   │   ├── elemwise_binary_op-inl.h
│   │   │   ├── elemwise_binary_op.cc
│   │   │   ├── elemwise_binary_op.h
│   │   │   ├── elemwise_binary_op_basic.cc
│   │   │   ├── elemwise_binary_op_basic.cu
│   │   │   ├── elemwise_binary_op_extended.cc
│   │   │   ├── elemwise_binary_op_extended.cu
│   │   │   ├── elemwise_binary_op_logic.cc
│   │   │   ├── elemwise_binary_op_logic.cu
│   │   │   ├── elemwise_binary_scalar_op.cc
│   │   │   ├── elemwise_binary_scalar_op.h
│   │   │   ├── elemwise_binary_scalar_op_basic.cc
│   │   │   ├── elemwise_binary_scalar_op_basic.cu
│   │   │   ├── elemwise_binary_scalar_op_extended.cc
│   │   │   ├── elemwise_binary_scalar_op_extended.cu
│   │   │   ├── elemwise_binary_scalar_op_logic.cc
│   │   │   ├── elemwise_binary_scalar_op_logic.cu
│   │   │   ├── elemwise_sum.cc
│   │   │   ├── elemwise_sum.cu
│   │   │   ├── elemwise_sum.h
│   │   │   ├── elemwise_unary_op.cc
│   │   │   ├── elemwise_unary_op.h
│   │   │   ├── elemwise_unary_op_basic.cc
│   │   │   ├── elemwise_unary_op_basic.cu
│   │   │   ├── elemwise_unary_op_logexp.cc
│   │   │   ├── elemwise_unary_op_logexp.cu
│   │   │   ├── elemwise_unary_op_pow.cc
│   │   │   ├── elemwise_unary_op_pow.cu
│   │   │   ├── elemwise_unary_op_trig.cc
│   │   │   ├── elemwise_unary_op_trig.cu
│   │   │   ├── histogram-inl.h
│   │   │   ├── histogram.cc
│   │   │   ├── histogram.cu
│   │   │   ├── index_add-inl.h
│   │   │   ├── index_add_backward.cc
│   │   │   ├── index_add_backward.cu
│   │   │   ├── index_add_forward.cc
│   │   │   ├── index_add_forward.cu
│   │   │   ├── index_update-inl.h
│   │   │   ├── index_update.cc
│   │   │   ├── index_update.cu
│   │   │   ├── indexing_op-inl.cuh
│   │   │   ├── indexing_op.cc
│   │   │   ├── indexing_op.cu
│   │   │   ├── indexing_op.h
│   │   │   ├── init_op.cc
│   │   │   ├── init_op.cu
│   │   │   ├── init_op.h
│   │   │   ├── la_op-inl.h
│   │   │   ├── la_op.cc
│   │   │   ├── la_op.cu
│   │   │   ├── la_op.h
│   │   │   ├── matrix_op-inl.h
│   │   │   ├── matrix_op.cc
│   │   │   ├── matrix_op.cu
│   │   │   ├── ordering_op-inl.h
│   │   │   ├── ordering_op.cc
│   │   │   ├── ordering_op.cu
│   │   │   ├── pseudo2DTranspose_op-inl.cuh
│   │   │   ├── ravel.cc
│   │   │   ├── ravel.cu
│   │   │   ├── ravel.h
│   │   │   ├── reduce_rtc.cc
│   │   │   ├── slice-inl.h
│   │   │   ├── sort_op-inl.cuh
│   │   │   ├── sort_op.h
│   │   │   ├── sparse_retain-inl.h
│   │   │   ├── sparse_retain.cc
│   │   │   ├── sparse_retain.cu
│   │   │   ├── square_sum-inl.h
│   │   │   ├── square_sum.cc
│   │   │   ├── square_sum.cu
│   │   │   └── util/
│   │   │       ├── tensor_util-inl.cuh
│   │   │       └── tensor_util-inl.h
│   │   └── tvmop/
│   │       ├── op_module.cc
│   │       └── op_module.h
│   ├── optimizer/
│   │   └── sgd-inl.h
│   ├── profiler/
│   │   ├── aggregate_stats.cc
│   │   ├── aggregate_stats.h
│   │   ├── custom_op_profiler.h
│   │   ├── profiler.cc
│   │   ├── profiler.h
│   │   ├── storage_profiler.cc
│   │   ├── storage_profiler.h
│   │   ├── vtune.cc
│   │   └── vtune.h
│   ├── resource.cc
│   ├── runtime/
│   │   ├── c_runtime_api.cc
│   │   ├── container.cc
│   │   ├── ndarray_handle.cc
│   │   ├── object.cc
│   │   ├── object_internal.h
│   │   └── registry.cc
│   ├── serialization/
│   │   ├── cnpy.cc
│   │   └── cnpy.h
│   └── storage/
│       ├── cpu_device_storage.h
│       ├── cpu_shared_storage_manager.h
│       ├── gpu_device_storage.h
│       ├── naive_storage_manager.h
│       ├── pinned_memory_storage.h
│       ├── pooled_storage_manager.h
│       ├── storage.cc
│       ├── storage_manager.h
│       └── storage_manager_helpers.h
├── tests/
│   ├── .gitignore
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── cpp/
│   │   ├── .gitignore
│   │   ├── engine/
│   │   │   ├── engine_shutdown_test.cc
│   │   │   ├── omp_test.cc
│   │   │   ├── thread_local_test.cc
│   │   │   └── threaded_engine_test.cc
│   │   ├── include/
│   │   │   ├── test_core_op.h
│   │   │   ├── test_dnnl.h
│   │   │   ├── test_legacy_op.h
│   │   │   ├── test_ndarray_utils.h
│   │   │   ├── test_op.h
│   │   │   ├── test_op_runner.h
│   │   │   ├── test_perf.h
│   │   │   ├── test_tune.h
│   │   │   └── test_util.h
│   │   ├── kvstore/
│   │   │   └── gpu_topology_test.cc
│   │   ├── misc/
│   │   │   ├── base.cc
│   │   │   └── libinfo_test.cc
│   │   ├── operator/
│   │   │   ├── activation_perf.cc
│   │   │   ├── batchnorm_test.cc
│   │   │   ├── coreop_perf.cc
│   │   │   ├── dnnl_operator_test.cc
│   │   │   ├── dnnl_test.cc
│   │   │   ├── dropout_perf.cc
│   │   │   ├── fully_conn_perf.cc
│   │   │   ├── krprod_test.cc
│   │   │   ├── runner/
│   │   │   │   └── core_op_runner_test.cc
│   │   │   ├── slice_channel_perf.cc
│   │   │   └── tune/
│   │   │       └── operator_tune_test.cc
│   │   ├── storage/
│   │   │   └── storage_test.cc
│   │   └── test_main.cc
│   ├── nightly/
│   │   ├── .gitignore
│   │   ├── Jenkinsfile
│   │   ├── JenkinsfileForBinaries
│   │   ├── README.md
│   │   ├── TestDoc/
│   │   │   ├── doc_spell_checker.py
│   │   │   └── doc_spell_grammar.sh
│   │   ├── common.py
│   │   ├── dist_async_kvstore.py
│   │   ├── dist_device_sync_kvstore.py
│   │   ├── dist_device_sync_kvstore_byteps.py
│   │   ├── dist_device_sync_kvstore_custom.py
│   │   ├── dist_device_sync_kvstore_horovod.py
│   │   ├── dist_sync_kvstore.py
│   │   ├── estimator/
│   │   │   ├── test_estimator_cnn.py
│   │   │   └── test_sentiment_rnn.py
│   │   ├── model_backwards_compatibility_check/
│   │   │   ├── JenkinsfileForMBCC
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   ├── common.py
│   │   │   ├── model_backward_compat_checker.sh
│   │   │   ├── model_backwards_compat_inference.py
│   │   │   ├── model_backwards_compat_train.py
│   │   │   ├── train_mxnet_legacy_models.sh
│   │   │   └── upload_models_to_s3.sh
│   │   ├── test_distributed_training-gpu.sh
│   │   ├── test_kvstore.py
│   │   ├── test_large_array.py
│   │   ├── test_large_vector.py
│   │   ├── test_np_large_array.py
│   │   ├── test_np_random.py
│   │   └── test_server_profiling.py
│   ├── python/
│   │   ├── README.md
│   │   ├── amp/
│   │   │   └── common.py
│   │   ├── array-api/
│   │   │   └── test_data_interchange.py
│   │   ├── common/
│   │   │   └── models.py
│   │   ├── conftest.py
│   │   ├── dnnl/
│   │   │   ├── op_cfg.py
│   │   │   ├── subgraphs/
│   │   │   │   ├── subgraph_common.py
│   │   │   │   ├── test_amp_subgraph.py
│   │   │   │   ├── test_conv_subgraph.py
│   │   │   │   ├── test_fc_subgraph.py
│   │   │   │   ├── test_matmul_subgraph.py
│   │   │   │   └── test_pow_mul_subgraph.py
│   │   │   ├── test_amp.py
│   │   │   ├── test_bf16_operator.py
│   │   │   ├── test_dnnl.py
│   │   │   └── test_quantization_dnnl.py
│   │   ├── doctest/
│   │   │   └── test_docstring.py
│   │   ├── gpu/
│   │   │   ├── test_amp.py
│   │   │   ├── test_amp_init.py
│   │   │   ├── test_deferred_compute_gpu.py
│   │   │   ├── test_device.py
│   │   │   ├── test_extensions_gpu.py
│   │   │   ├── test_fusion.py
│   │   │   ├── test_gluon_gpu.py
│   │   │   ├── test_gluon_model_zoo_gpu.py
│   │   │   ├── test_gluon_transforms.py
│   │   │   ├── test_kvstore_gpu.py
│   │   │   ├── test_nccl.py
│   │   │   ├── test_numpy_einsum.py
│   │   │   ├── test_numpy_fallback.py
│   │   │   ├── test_operator_gpu.py
│   │   │   ├── test_profiler_gpu.py
│   │   │   ├── test_rtc.py
│   │   │   ├── test_tvm_bridge.py
│   │   │   └── test_tvm_op_gpu.py
│   │   ├── onnx/
│   │   │   ├── test_models.py
│   │   │   └── test_operators.py
│   │   ├── profiling/
│   │   │   ├── simple_forward.py
│   │   │   └── test_nvtx.py
│   │   ├── quantization/
│   │   │   └── test_quantization.py
│   │   ├── test_quantization_gpu.py
│   │   ├── train/
│   │   │   ├── common.py
│   │   │   └── test_autograd.py
│   │   └── unittest/
│   │       ├── common.py
│   │       ├── legacy_ndarray.v0
│   │       ├── test_attr.py
│   │       ├── test_autograd.py
│   │       ├── test_base.py
│   │       ├── test_contrib_control_flow.py
│   │       ├── test_contrib_gluon_data_vision.py
│   │       ├── test_contrib_hawkesll.py
│   │       ├── test_contrib_intgemm.py
│   │       ├── test_contrib_io.py
│   │       ├── test_contrib_krprod.py
│   │       ├── test_contrib_operator.py
│   │       ├── test_contrib_optimizer.py
│   │       ├── test_contrib_stes_op.py
│   │       ├── test_deferred_compute.py
│   │       ├── test_dgl_graph.py
│   │       ├── test_dynamic_shape.py
│   │       ├── test_engine.py
│   │       ├── test_engine_import.py
│   │       ├── test_exc_handling.py
│   │       ├── test_executor.py
│   │       ├── test_extensions.py
│   │       ├── test_ffi_container.py
│   │       ├── test_gluon.py
│   │       ├── test_gluon_batch_processor.py
│   │       ├── test_gluon_control_flow.py
│   │       ├── test_gluon_data.py
│   │       ├── test_gluon_estimator.py
│   │       ├── test_gluon_event_handler.py
│   │       ├── test_gluon_indexing.py
│   │       ├── test_gluon_model_zoo.py
│   │       ├── test_gluon_probability_v2.py
│   │       ├── test_gluon_rnn.py
│   │       ├── test_gluon_save.py
│   │       ├── test_gluon_trainer.py
│   │       ├── test_gluon_utils.py
│   │       ├── test_higher_order_grad.py
│   │       ├── test_image.py
│   │       ├── test_infer_shape.py
│   │       ├── test_infer_type.py
│   │       ├── test_io.py
│   │       ├── test_kvstore.py
│   │       ├── test_kvstore_custom.py
│   │       ├── test_loss.py
│   │       ├── test_memory_opt.py
│   │       ├── test_metric.py
│   │       ├── test_ndarray.py
│   │       ├── test_numpy_contrib_gluon_data_vision.py
│   │       ├── test_numpy_default_dtype.py
│   │       ├── test_numpy_gluon.py
│   │       ├── test_numpy_gluon_data_vision.py
│   │       ├── test_numpy_interoperability.py
│   │       ├── test_numpy_loss.py
│   │       ├── test_numpy_ndarray.py
│   │       ├── test_numpy_op.py
│   │       ├── test_operator.py
│   │       ├── test_optimizer.py
│   │       ├── test_profiler.py
│   │       ├── test_random.py
│   │       ├── test_recordio.py
│   │       ├── test_runtime.py
│   │       ├── test_smoke.py
│   │       ├── test_sparse_ndarray.py
│   │       ├── test_sparse_operator.py
│   │       ├── test_subgraph.py
│   │       ├── test_subgraph_op.py
│   │       ├── test_symbol.py
│   │       ├── test_test_utils.py
│   │       ├── test_thread_local.py
│   │       ├── test_tvm_op.py
│   │       └── test_viz.py
│   ├── tutorials/
│   │   ├── test_sanity_tutorials.py
│   │   └── test_tutorials.py
│   └── utils/
│       └── notebook_test/
│           └── __init__.py
└── tools/
    ├── bandwidth/
    │   ├── .gitignore
    │   ├── README.md
    │   ├── measure.py
    │   └── test_measure.py
    ├── cfn/
    │   └── Readme.md
    ├── create_source_archive.sh
    ├── dependencies/
    │   ├── LICENSE.binary.dependencies
    │   ├── README.md
    │   ├── cityhash.sh
    │   ├── curl.sh
    │   ├── eigen.sh
    │   ├── libpng.sh
    │   ├── libtiff.sh
    │   ├── libturbojpeg.sh
    │   ├── libz.sh
    │   ├── lz4.sh
    │   ├── make_shared_dependencies.sh
    │   ├── mkl.sh
    │   ├── numpy_mkl.sh
    │   ├── openblas.sh
    │   ├── opencv.sh
    │   ├── openssl.sh
    │   ├── patch/
    │   │   └── opencv_lapack.h
    │   ├── protobuf.sh
    │   └── zmq.sh
    ├── diagnose.py
    ├── flakiness_checker.py
    ├── git-pre-commit
    ├── im2rec.cc
    ├── im2rec.py
    ├── ipynb2md.py
    ├── kill-mxnet.py
    ├── launch.py
    ├── license_header.py
    ├── lint/
    │   ├── clang_format_ci.sh
    │   └── git-clang-format-13
    ├── parse_log.py
    ├── pip/
    │   ├── MANIFEST.in
    │   ├── doc/
    │   │   ├── CPU_ADDITIONAL.md
    │   │   ├── CU101_ADDITIONAL.md
    │   │   ├── CU102_ADDITIONAL.md
    │   │   ├── CU110_ADDITIONAL.md
    │   │   ├── CU112_ADDITIONAL.md
    │   │   ├── NATIVE_ADDITIONAL.md
    │   │   └── PYPI_README.md
    │   ├── sanity_test.py
    │   └── setup.py
    ├── profile/
    │   └── tune_python.sh
    ├── rec2idx.py
    ├── staticbuild/
    │   ├── README.md
    │   ├── build.sh
    │   ├── build_lib.sh
    │   └── build_wheel.sh
    └── windowsbuild/
        ├── README.md
        ├── gen_warp.cpp
        └── warp_dll.cpp
Download .txt
Showing preview only (637K chars total). Download the full file or copy to clipboard to get everything.
SYMBOL INDEX (7970 symbols across 690 files)

FILE: 3rdparty/ctc_include/contrib/moderngpu/include/mgpuenums.h
  function namespace (line 37) | namespace mgpu {

FILE: 3rdparty/ctc_include/contrib/moderngpu/include/util/static.h
  function namespace (line 67) | namespace mgpu {

FILE: 3rdparty/ctc_include/detail/cpu_ctc.h
  function namespace (line 32) | namespace mxnet_warpctc {

FILE: 3rdparty/ctc_include/detail/ctc_helper.h
  type ctcStatus_t (line 28) | typedef enum {
  type ctcComputeLocation (line 36) | typedef enum {
  function namespace (line 41) | namespace ctc_helper {

FILE: 3rdparty/ctc_include/detail/gpu_ctc.h
  function namespace (line 26) | namespace mxnet_warpctc {

FILE: 3rdparty/ctc_include/detail/gpu_ctc_kernels.h
  type CTASegReduce (line 30) | struct CTASegReduce {
  type CTASegReduce (line 247) | typedef CTASegReduce<NT, VT, ProbT, int, ctc_helper::log_plus<ProbT>> Se...

FILE: 3rdparty/miniz/miniz.c
  function mz_ulong (line 39) | mz_ulong mz_adler32(mz_ulong adler, const unsigned char *ptr, size_t buf...
  function mz_ulong (line 69) | mz_ulong mz_crc32(mz_ulong crc, const mz_uint8 *ptr, size_t buf_len)
  function mz_ulong (line 88) | mz_ulong mz_crc32(mz_ulong crc, const mz_uint8 *ptr, size_t buf_len)
  function mz_free (line 155) | void mz_free(void *p)
  function miniz_def_free_func (line 165) | void miniz_def_free_func(void *opaque, void *address)
  function mz_deflateInit (line 183) | int mz_deflateInit(mz_streamp pStream, int level)
  function mz_deflateInit2 (line 188) | int mz_deflateInit2(mz_streamp pStream, int level, int method, int windo...
  function mz_deflateReset (line 224) | int mz_deflateReset(mz_streamp pStream)
  function mz_deflate (line 233) | int mz_deflate(mz_streamp pStream, int flush)
  function mz_deflateEnd (line 291) | int mz_deflateEnd(mz_streamp pStream)
  function mz_ulong (line 303) | mz_ulong mz_deflateBound(mz_streamp pStream, mz_ulong source_len)
  function mz_compress2 (line 310) | int mz_compress2(unsigned char *pDest, mz_ulong *pDest_len, const unsign...
  function mz_compress (line 340) | int mz_compress(unsigned char *pDest, mz_ulong *pDest_len, const unsigne...
  function mz_ulong (line 345) | mz_ulong mz_compressBound(mz_ulong source_len)
  type inflate_state (line 350) | typedef struct
  function mz_inflateInit2 (line 359) | int mz_inflateInit2(mz_streamp pStream, int window_bits)
  function mz_inflateInit (line 395) | int mz_inflateInit(mz_streamp pStream)
  function mz_inflateReset (line 400) | int mz_inflateReset(mz_streamp pStream)
  function mz_inflate (line 426) | int mz_inflate(mz_streamp pStream, int flush)
  function mz_inflateEnd (line 538) | int mz_inflateEnd(mz_streamp pStream)
  function mz_uncompress (line 550) | int mz_uncompress(unsigned char *pDest, mz_ulong *pDest_len, const unsig...
  type tdefl_sym_freq (line 728) | typedef struct
  function tdefl_sym_freq (line 732) | static tdefl_sym_freq *tdefl_radix_sort_syms(mz_uint num_syms, tdefl_sym...
  function tdefl_calculate_minimum_redundancy (line 766) | static void tdefl_calculate_minimum_redundancy(tdefl_sym_freq *A, int n)
  function tdefl_huffman_enforce_max_code_size (line 826) | static void tdefl_huffman_enforce_max_code_size(int *pNum_codes, int cod...
  function tdefl_optimize_huffman_table (line 850) | static void tdefl_optimize_huffman_table(tdefl_compressor *d, int table_...
  function tdefl_start_dynamic_block (line 969) | static void tdefl_start_dynamic_block(tdefl_compressor *d)
  function tdefl_start_static_block (line 1056) | static void tdefl_start_static_block(tdefl_compressor *d)
  function mz_bool (line 1081) | static mz_bool tdefl_compress_lz_codes(tdefl_compressor *d)
  function mz_bool (line 1175) | static mz_bool tdefl_compress_lz_codes(tdefl_compressor *d)
  function mz_bool (line 1223) | static mz_bool tdefl_compress_block(tdefl_compressor *d, mz_bool static_...
  function tdefl_flush_block (line 1232) | static int tdefl_flush_block(tdefl_compressor *d, int flush)
  function mz_uint16 (line 1369) | static mz_uint16 TDEFL_READ_UNALIGNED_WORD(const mz_uint8* p)
  function mz_uint16 (line 1375) | static mz_uint16 TDEFL_READ_UNALIGNED_WORD2(const mz_uint16* p)
  function MZ_FORCEINLINE (line 1385) | static MZ_FORCEINLINE void tdefl_find_match(tdefl_compressor *d, mz_uint...
  function MZ_FORCEINLINE (line 1438) | static MZ_FORCEINLINE void tdefl_find_match(tdefl_compressor *d, mz_uint...
  function mz_uint32 (line 1485) | static mz_uint32 TDEFL_READ_UNALIGNED_WORD32(const mz_uint8* p)
  function mz_bool (line 1494) | static mz_bool tdefl_compress_fast(tdefl_compressor *d)
  function MZ_FORCEINLINE (line 1668) | static MZ_FORCEINLINE void tdefl_record_literal(tdefl_compressor *d, mz_...
  function MZ_FORCEINLINE (line 1681) | static MZ_FORCEINLINE void tdefl_record_match(tdefl_compressor *d, mz_ui...
  function mz_bool (line 1711) | static mz_bool tdefl_compress_normal(tdefl_compressor *d)
  function tdefl_status (line 1856) | static tdefl_status tdefl_flush_output_buffer(tdefl_compressor *d)
  function tdefl_status (line 1877) | tdefl_status tdefl_compress(tdefl_compressor *d, const void *pIn_buf, si...
  function tdefl_status (line 1945) | tdefl_status tdefl_compress_buffer(tdefl_compressor *d, const void *pIn_...
  function tdefl_status (line 1951) | tdefl_status tdefl_init(tdefl_compressor *d, tdefl_put_buf_func_ptr pPut...
  function tdefl_status (line 1986) | tdefl_status tdefl_get_prev_return_status(tdefl_compressor *d)
  function mz_uint32 (line 1991) | mz_uint32 tdefl_get_adler32(tdefl_compressor *d)
  function mz_bool (line 1996) | mz_bool tdefl_compress_mem_to_output(const void *pBuf, size_t buf_len, t...
  type tdefl_output_buffer (line 2011) | typedef struct
  function mz_bool (line 2018) | static mz_bool tdefl_output_buffer_putter(const void *pBuf, int len, voi...
  function tdefl_compress_mem_to_mem (line 2058) | size_t tdefl_compress_mem_to_mem(void *pOut_buf, size_t out_buf_len, con...
  function mz_uint (line 2074) | mz_uint tdefl_create_comp_flags_from_zip_params(int level, int window_bi...
  function tdefl_compressor (line 2190) | tdefl_compressor *tdefl_compressor_alloc()
  function tdefl_compressor_free (line 2195) | void tdefl_compressor_free(tdefl_compressor *pComp)
  function tinfl_status (line 2381) | tinfl_status tinfl_decompress(tinfl_decompressor *r, const mz_uint8 *pIn...
  function tinfl_decompress_mem_to_mem (line 2892) | size_t tinfl_decompress_mem_to_mem(void *pOut_buf, size_t out_buf_len, c...
  function tinfl_decompress_mem_to_callback (line 2901) | int tinfl_decompress_mem_to_callback(const void *pIn_buf, size_t *pIn_bu...
  function tinfl_decompressor (line 2931) | tinfl_decompressor *tinfl_decompressor_alloc()
  function tinfl_decompressor_free (line 2939) | void tinfl_decompressor_free(tinfl_decompressor *pDecomp)
  function FILE (line 2990) | static FILE *mz_fopen(const char *pFilename, const char *pMode)
  function FILE (line 2996) | static FILE *mz_freopen(const char *pPath, const char *pMode, FILE *pStr...
  type mz_zip_array (line 3194) | typedef struct
  type mz_zip_internal_state_tag (line 3201) | struct mz_zip_internal_state_tag
  function MZ_FORCEINLINE (line 3228) | static MZ_FORCEINLINE mz_uint mz_zip_array_range_check(const mz_zip_arra...
  function MZ_FORCEINLINE (line 3238) | static MZ_FORCEINLINE void mz_zip_array_init(mz_zip_array *pArray, mz_ui...
  function MZ_FORCEINLINE (line 3244) | static MZ_FORCEINLINE void mz_zip_array_clear(mz_zip_archive *pZip, mz_z...
  function mz_bool (line 3250) | static mz_bool mz_zip_array_ensure_capacity(mz_zip_archive *pZip, mz_zip...
  function MZ_FORCEINLINE (line 3270) | static MZ_FORCEINLINE mz_bool mz_zip_array_reserve(mz_zip_archive *pZip,...
  function MZ_FORCEINLINE (line 3280) | static MZ_FORCEINLINE mz_bool mz_zip_array_resize(mz_zip_archive *pZip, ...
  function MZ_FORCEINLINE (line 3291) | static MZ_FORCEINLINE mz_bool mz_zip_array_ensure_room(mz_zip_archive *p...
  function MZ_FORCEINLINE (line 3296) | static MZ_FORCEINLINE mz_bool mz_zip_array_push_back(mz_zip_archive *pZi...
  function MZ_TIME_T (line 3307) | static MZ_TIME_T mz_zip_dos_to_time_t(int dos_time, int dos_date)
  function mz_zip_time_t_to_dos_time (line 3322) | static void mz_zip_time_t_to_dos_time(MZ_TIME_T time, mz_uint16 *pDOS_ti...
  function mz_bool (line 3345) | static mz_bool mz_zip_get_file_modified_time(const char *pFilename, MZ_T...
  function mz_bool (line 3359) | static mz_bool mz_zip_set_file_times(const char *pFilename, MZ_TIME_T ac...
  function MZ_FORCEINLINE (line 3372) | static MZ_FORCEINLINE mz_bool mz_zip_set_error(mz_zip_archive *pZip, mz_...
  function mz_bool (line 3379) | static mz_bool mz_zip_reader_init_internal(mz_zip_archive *pZip, mz_uint...
  function MZ_FORCEINLINE (line 3413) | static MZ_FORCEINLINE mz_bool mz_zip_reader_filename_less(const mz_zip_a...
  function mz_zip_reader_sort_central_dir_offsets_by_filename (line 3442) | static void mz_zip_reader_sort_central_dir_offsets_by_filename(mz_zip_ar...
  function mz_bool (line 3494) | static mz_bool mz_zip_reader_locate_header_sig(mz_zip_archive *pZip, mz_...
  function mz_bool (line 3540) | static mz_bool mz_zip_reader_read_central_dir(mz_zip_archive *pZip, mz_u...
  function mz_zip_zero_struct (line 3792) | void mz_zip_zero_struct(mz_zip_archive *pZip)
  function mz_bool (line 3798) | static mz_bool mz_zip_reader_end_internal(mz_zip_archive *pZip, mz_bool ...
  function mz_bool (line 3845) | mz_bool mz_zip_reader_end(mz_zip_archive *pZip)
  function mz_bool (line 3849) | mz_bool mz_zip_reader_init(mz_zip_archive *pZip, mz_uint64 size, mz_uint...
  function mz_zip_mem_read_func (line 3869) | static size_t mz_zip_mem_read_func(void *pOpaque, mz_uint64 file_ofs, vo...
  function mz_bool (line 3877) | mz_bool mz_zip_reader_init_mem(mz_zip_archive *pZip, const void *pMem, s...
  function mz_zip_file_read_func (line 3912) | static size_t mz_zip_file_read_func(void *pOpaque, mz_uint64 file_ofs, v...
  function mz_bool (line 3925) | mz_bool mz_zip_reader_init_file(mz_zip_archive *pZip, const char *pFilen...
  function mz_bool (line 3930) | mz_bool mz_zip_reader_init_file_v2(mz_zip_archive *pZip, const char *pFi...
  function mz_bool (line 3984) | mz_bool mz_zip_reader_init_cfile(mz_zip_archive *pZip, MZ_FILE *pFile, m...
  function MZ_FORCEINLINE (line 4026) | static MZ_FORCEINLINE const mz_uint8 *mz_zip_get_cdh(mz_zip_archive *pZi...
  function mz_bool (line 4033) | mz_bool mz_zip_reader_is_file_encrypted(mz_zip_archive *pZip, mz_uint fi...
  function mz_bool (line 4047) | mz_bool mz_zip_reader_is_file_supported(mz_zip_archive *pZip, mz_uint fi...
  function mz_bool (line 4083) | mz_bool mz_zip_reader_is_file_a_directory(mz_zip_archive *pZip, mz_uint ...
  function mz_bool (line 4115) | static mz_bool mz_zip_file_stat_internal(mz_zip_archive *pZip, mz_uint f...
  function MZ_FORCEINLINE (line 4235) | static MZ_FORCEINLINE mz_bool mz_zip_string_equal(const char *pA, const ...
  function MZ_FORCEINLINE (line 4246) | static MZ_FORCEINLINE int mz_zip_filename_compare(const mz_zip_array *pC...
  function mz_bool (line 4263) | static mz_bool mz_zip_locate_file_binary_search(mz_zip_archive *pZip, co...
  function mz_zip_reader_locate_file (line 4303) | int mz_zip_reader_locate_file(mz_zip_archive *pZip, const char *pName, c...
  function mz_bool (line 4312) | mz_bool mz_zip_reader_locate_file_v2(mz_zip_archive *pZip, const char *p...
  function mz_bool (line 4377) | mz_bool mz_zip_reader_extract_to_mem_no_alloc(mz_zip_archive *pZip, mz_u...
  function mz_bool (line 4520) | mz_bool mz_zip_reader_extract_file_to_mem_no_alloc(mz_zip_archive *pZip,...
  function mz_bool (line 4528) | mz_bool mz_zip_reader_extract_to_mem(mz_zip_archive *pZip, mz_uint file_...
  function mz_bool (line 4533) | mz_bool mz_zip_reader_extract_file_to_mem(mz_zip_archive *pZip, const ch...
  function mz_bool (line 4592) | mz_bool mz_zip_reader_extract_to_callback(mz_zip_archive *pZip, mz_uint ...
  function mz_bool (line 4790) | mz_bool mz_zip_reader_extract_file_to_callback(mz_zip_archive *pZip, con...
  function mz_zip_reader_extract_iter_state (line 4799) | mz_zip_reader_extract_iter_state* mz_zip_reader_extract_iter_new(mz_zip_...
  function mz_zip_reader_extract_iter_state (line 4927) | mz_zip_reader_extract_iter_state* mz_zip_reader_extract_file_iter_new(mz...
  function mz_zip_reader_extract_iter_read (line 4939) | size_t mz_zip_reader_extract_iter_read(mz_zip_reader_extract_iter_state*...
  function mz_bool (line 5056) | mz_bool mz_zip_reader_extract_iter_free(mz_zip_reader_extract_iter_state...
  function mz_zip_file_write_callback (line 5098) | static size_t mz_zip_file_write_callback(void *pOpaque, mz_uint64 ofs, c...
  function mz_bool (line 5105) | mz_bool mz_zip_reader_extract_to_file(mz_zip_archive *pZip, mz_uint file...
  function mz_bool (line 5139) | mz_bool mz_zip_reader_extract_file_to_file(mz_zip_archive *pZip, const c...
  function mz_bool (line 5148) | mz_bool mz_zip_reader_extract_to_cfile(mz_zip_archive *pZip, mz_uint fil...
  function mz_bool (line 5161) | mz_bool mz_zip_reader_extract_file_to_cfile(mz_zip_archive *pZip, const ...
  function mz_zip_compute_crc32_callback (line 5171) | static size_t mz_zip_compute_crc32_callback(void *pOpaque, mz_uint64 fil...
  function mz_bool (line 5179) | mz_bool mz_zip_validate_file(mz_zip_archive *pZip, mz_uint file_index, m...
  function mz_bool (line 5385) | mz_bool mz_zip_validate_archive(mz_zip_archive *pZip, mz_uint flags)
  function mz_bool (line 5438) | mz_bool mz_zip_validate_mem_archive(const void *pMem, size_t size, mz_ui...
  function mz_bool (line 5480) | mz_bool mz_zip_validate_file_archive(const char *pFilename, mz_uint flag...
  function MZ_FORCEINLINE (line 5526) | static MZ_FORCEINLINE void mz_write_le16(mz_uint8 *p, mz_uint16 v)
  function MZ_FORCEINLINE (line 5531) | static MZ_FORCEINLINE void mz_write_le32(mz_uint8 *p, mz_uint32 v)
  function MZ_FORCEINLINE (line 5538) | static MZ_FORCEINLINE void mz_write_le64(mz_uint8 *p, mz_uint64 v)
  function mz_zip_heap_write_func (line 5548) | static size_t mz_zip_heap_write_func(void *pOpaque, mz_uint64 file_ofs, ...
  function mz_bool (line 5586) | static mz_bool mz_zip_writer_end_internal(mz_zip_archive *pZip, mz_bool ...
  function mz_bool (line 5632) | mz_bool mz_zip_writer_init_v2(mz_zip_archive *pZip, mz_uint64 existing_s...
  function mz_bool (line 5681) | mz_bool mz_zip_writer_init(mz_zip_archive *pZip, mz_uint64 existing_size)
  function mz_bool (line 5686) | mz_bool mz_zip_writer_init_heap_v2(mz_zip_archive *pZip, size_t size_to_...
  function mz_bool (line 5714) | mz_bool mz_zip_writer_init_heap(mz_zip_archive *pZip, size_t size_to_res...
  function mz_zip_file_write_func (line 5720) | static size_t mz_zip_file_write_func(void *pOpaque, mz_uint64 file_ofs, ...
  function mz_bool (line 5736) | mz_bool mz_zip_writer_init_file(mz_zip_archive *pZip, const char *pFilen...
  function mz_bool (line 5741) | mz_bool mz_zip_writer_init_file_v2(mz_zip_archive *pZip, const char *pFi...
  function mz_bool (line 5788) | mz_bool mz_zip_writer_init_cfile(mz_zip_archive *pZip, MZ_FILE *pFile, m...
  function mz_bool (line 5809) | mz_bool mz_zip_writer_init_from_reader_v2(mz_zip_archive *pZip, const ch...
  function mz_bool (line 5896) | mz_bool mz_zip_writer_init_from_reader(mz_zip_archive *pZip, const char ...
  function mz_bool (line 5902) | mz_bool mz_zip_writer_add_mem(mz_zip_archive *pZip, const char *pArchive...
  type mz_zip_writer_add_state (line 5907) | typedef struct
  function mz_bool (line 5914) | static mz_bool mz_zip_writer_add_put_buf_callback(const void *pBuf, int ...
  function mz_uint32 (line 5927) | static mz_uint32 mz_zip_writer_create_zip64_extra_data(mz_uint8 *pBuf, m...
  function mz_bool (line 5962) | static mz_bool mz_zip_writer_create_local_dir_header(mz_zip_archive *pZi...
  function mz_bool (line 5980) | static mz_bool mz_zip_writer_create_central_dir_header(mz_zip_archive *p...
  function mz_bool (line 6005) | static mz_bool mz_zip_writer_add_to_central_dir(mz_zip_archive *pZip, co...
  function mz_bool (line 6045) | static mz_bool mz_zip_writer_validate_archive_name(const char *pArchive_...
  function mz_uint (line 6056) | static mz_uint mz_zip_writer_compute_padding_needed_for_file_alignment(m...
  function mz_bool (line 6065) | static mz_bool mz_zip_writer_write_zeros(mz_zip_archive *pZip, mz_uint64...
  function mz_bool (line 6081) | mz_bool mz_zip_writer_add_mem_ex(mz_zip_archive *pZip, const char *pArch...
  function mz_bool (line 6087) | mz_bool mz_zip_writer_add_mem_ex_v2(mz_zip_archive *pZip, const char *pA...
  function mz_bool (line 6373) | mz_bool mz_zip_writer_add_read_buf_callback(mz_zip_archive *pZip, const ...
  function mz_file_read_func_stdio (line 6678) | static size_t mz_file_read_func_stdio(void *pOpaque, mz_uint64 file_ofs,...
  function mz_bool (line 6689) | mz_bool mz_zip_writer_add_cfile(mz_zip_archive *pZip, const char *pArchi...
  function mz_bool (line 6696) | mz_bool mz_zip_writer_add_file(mz_zip_archive *pZip, const char *pArchiv...
  function mz_bool (line 6728) | static mz_bool mz_zip_writer_update_zip64_extension_block(mz_zip_array *...
  function mz_bool (line 6808) | mz_bool mz_zip_writer_add_from_zip_reader(mz_zip_archive *pZip, mz_zip_a...
  function mz_bool (line 7170) | mz_bool mz_zip_writer_finalize_archive(mz_zip_archive *pZip)
  function mz_bool (line 7258) | mz_bool mz_zip_writer_finalize_heap_archive(mz_zip_archive *pZip, void *...
  function mz_bool (line 7283) | mz_bool mz_zip_writer_end(mz_zip_archive *pZip)
  function mz_bool (line 7289) | mz_bool mz_zip_add_mem_to_archive_file_in_place(const char *pZip_filenam...
  function mz_bool (line 7294) | mz_bool mz_zip_add_mem_to_archive_file_in_place_v2(const char *pZip_file...
  function mz_zip_mode (line 7437) | mz_zip_mode mz_zip_get_mode(mz_zip_archive *pZip)
  function mz_zip_type (line 7442) | mz_zip_type mz_zip_get_type(mz_zip_archive *pZip)
  function mz_zip_error (line 7447) | mz_zip_error mz_zip_set_last_error(mz_zip_archive *pZip, mz_zip_error er...
  function mz_zip_error (line 7460) | mz_zip_error mz_zip_peek_last_error(mz_zip_archive *pZip)
  function mz_zip_error (line 7468) | mz_zip_error mz_zip_clear_last_error(mz_zip_archive *pZip)
  function mz_zip_error (line 7473) | mz_zip_error mz_zip_get_last_error(mz_zip_archive *pZip)
  function mz_bool (line 7562) | mz_bool mz_zip_is_zip64(mz_zip_archive *pZip)
  function mz_zip_get_central_dir_size (line 7570) | size_t mz_zip_get_central_dir_size(mz_zip_archive *pZip)
  function mz_uint (line 7578) | mz_uint mz_zip_reader_get_num_files(mz_zip_archive *pZip)
  function mz_uint64 (line 7583) | mz_uint64 mz_zip_get_archive_size(mz_zip_archive *pZip)
  function mz_uint64 (line 7590) | mz_uint64 mz_zip_get_archive_file_start_offset(mz_zip_archive *pZip)
  function MZ_FILE (line 7597) | MZ_FILE *mz_zip_get_cfile(mz_zip_archive *pZip)
  function mz_zip_read_archive_data (line 7604) | size_t mz_zip_read_archive_data(mz_zip_archive *pZip, mz_uint64 file_ofs...
  function mz_uint (line 7612) | mz_uint mz_zip_reader_get_filename(mz_zip_archive *pZip, mz_uint file_in...
  function mz_bool (line 7633) | mz_bool mz_zip_reader_file_stat(mz_zip_archive *pZip, mz_uint file_index...
  function mz_bool (line 7638) | mz_bool mz_zip_end(mz_zip_archive *pZip)

FILE: 3rdparty/miniz/miniz.h
  type mz_ulong (line 225) | typedef unsigned long mz_ulong;
  type mz_internal_state (line 306) | struct mz_internal_state
  type mz_stream (line 309) | typedef struct mz_stream_s
  type mz_stream (line 331) | typedef mz_stream *mz_streamp;
  type Byte (line 427) | typedef unsigned char Byte;
  type uInt (line 428) | typedef unsigned int uInt;
  type mz_ulong (line 429) | typedef mz_ulong uLong;
  type Byte (line 430) | typedef Byte Bytef;
  type uInt (line 431) | typedef uInt uIntf;
  type charf (line 432) | typedef char charf;
  type intf (line 433) | typedef int intf;
  type uLong (line 435) | typedef uLong uLongf;
  type mz_uint8 (line 512) | typedef unsigned char mz_uint8;
  type mz_int16 (line 513) | typedef signed short mz_int16;
  type mz_uint16 (line 514) | typedef unsigned short mz_uint16;
  type mz_uint32 (line 515) | typedef unsigned int mz_uint32;
  type mz_uint (line 516) | typedef unsigned int mz_uint;
  type mz_int64 (line 517) | typedef int64_t mz_int64;
  type mz_uint64 (line 518) | typedef uint64_t mz_uint64;
  type mz_bool (line 519) | typedef int mz_bool;
  type mz_dummy_time_t (line 539) | typedef struct mz_dummy_time_t_tag
  type mz_bool (line 666) | typedef mz_bool (*tdefl_put_buf_func_ptr)(const void *pBuf, int len, voi...
  type tdefl_status (line 709) | typedef enum {
  type tdefl_flush (line 717) | typedef enum {
  type tdefl_compressor (line 725) | typedef struct
  type tinfl_decompressor_tag (line 826) | struct tinfl_decompressor_tag
  type tinfl_decompressor (line 827) | typedef struct tinfl_decompressor_tag tinfl_decompressor;
  type tinfl_status (line 841) | typedef enum {
  type tinfl_huff_table (line 898) | typedef struct
  type mz_uint64 (line 911) | typedef mz_uint64 tinfl_bit_buf_t;
  type mz_uint32 (line 914) | typedef mz_uint32 tinfl_bit_buf_t;
  type tinfl_decompressor_tag (line 918) | struct tinfl_decompressor_tag
  type mz_zip_archive_file_stat (line 950) | typedef struct
  type mz_bool (line 1008) | typedef mz_bool (*mz_file_needs_keepalive)(void *pOpaque);
  type mz_zip_internal_state_tag (line 1010) | struct mz_zip_internal_state_tag
  type mz_zip_internal_state (line 1011) | typedef struct mz_zip_internal_state_tag mz_zip_internal_state;
  type mz_zip_mode (line 1013) | typedef enum {
  type mz_zip_flags (line 1020) | typedef enum {
  type mz_zip_type (line 1032) | typedef enum {
  type mz_zip_error (line 1043) | typedef enum {
  type mz_zip_archive (line 1079) | typedef struct
  type mz_zip_reader_extract_iter_state (line 1106) | typedef struct

FILE: 3rdparty/mshadow/guide/basic.cpp
  function main (line 27) | int main(void) {

FILE: 3rdparty/mshadow/guide/defop.cpp
  type addone (line 29) | struct addone {
    method MSHADOW_XINLINE (line 32) | MSHADOW_XINLINE static DType Map(DType a) {
  type maxoftwo (line 37) | struct maxoftwo {
    method MSHADOW_XINLINE (line 40) | MSHADOW_XINLINE static float Map(float a, float b) {
  function main (line 46) | int main(void) {

FILE: 3rdparty/mshadow/guide/mshadow-ps/dbstr.h
  function string (line 25) | string dbstr(mshadow::Tensor<mshadow::cpu, 1, DType> ts) {
  function string (line 34) | string dbstr(mshadow::Tensor<mshadow::cpu, 2, DType> ts) {
  function string (line 47) | string dbstr(mshadow::Tensor<mshadow::cpu, 3, DType> ts) {

FILE: 3rdparty/mshadow/guide/mshadow-ps/dist_async_sum-inl.h
  function namespace (line 34) | namespace mshadow {

FILE: 3rdparty/mshadow/guide/mshadow-ps/dist_async_sum.cpp
  function CreateServerNode (line 22) | int CreateServerNode(int argc, char *argv[]) {
  function WorkerNodeMain (line 28) | int WorkerNodeMain(int argc, char *argv[]) {

FILE: 3rdparty/mshadow/guide/mshadow-ps/local_sum-inl.h
  function Print_ (line 28) | void Print_(mshadow::Tensor<mshadow::cpu, 2, float> ts) {
  function namespace (line 93) | namespace mshadow {

FILE: 3rdparty/mshadow/guide/mshadow-ps/local_sum.cpp
  function main (line 21) | int main(int argc, char *argv[]) {

FILE: 3rdparty/mshadow/guide/neuralnet/util.h
  type real_t (line 26) | typedef float real_t;
  function pack (line 30) | int pack(unsigned char zz[4]){
  function LoadMNIST (line 51) | inline void LoadMNIST(const char *path_img, const char *path_label,

FILE: 3rdparty/mshadow/mshadow-ps/mshadow_ps.h
  function namespace (line 47) | namespace mshadow {
  function InvokeLambda_ (line 272) | inline static void InvokeLambda_(Stream<xpu> *stream, void *fun) {
  function virtual (line 289) | virtual void SetParam(const char *name, const char *val) {}
  function virtual (line 296) | virtual void InitUpdater(int rank, int argc, char *argv[]) {}
  function virtual (line 303) | virtual void InitModel(int key, DType *dptr, size_t size) {
  function virtual (line 312) | virtual void Update(int key, DType *dptr, size_t size) {
  function virtual (line 332) | virtual void Update_(int key, Tensor<cpu, 1, DType> data) {
  function namespace (line 349) | namespace mshadow {

FILE: 3rdparty/mshadow/mshadow-ps/ps_dist-inl.h
  function namespace (line 35) | namespace mshadow {

FILE: 3rdparty/mshadow/mshadow-ps/ps_local-inl.h
  type ms_omp_uint (line 35) | typedef int ms_omp_uint;
  type ms_omp_uint (line 37) | typedef unsigned ms_omp_uint;
  function namespace (line 44) | namespace mshadow {
  function Init (line 532) | inline void Init(int ndevice, Shape<2> shape,
  type PullEntry (line 580) | struct PullEntry {
  function PushHandlerGlobal (line 681) | inline void PushHandlerGlobal(void) {
  function PushHandlerLocal (line 694) | inline void PushHandlerLocal(size_t tid) {
  function MSHADOW_THREAD_PREFIX (line 705) | inline static MSHADOW_THREAD_PREFIX PushGlobalThread(void *pthread) {
  function MSHADOW_THREAD_PREFIX (line 710) | inline static MSHADOW_THREAD_PREFIX PushLocalThread(void *arg) {
  function PullProc (line 718) | inline void PullProc(utils::ThreadPQueue<std::pair<int, int> > *queue) {
  function PullHandlerGlobal (line 756) | inline void PullHandlerGlobal(void) {
  function PullHandlerLocal (line 769) | inline void PullHandlerLocal(size_t tid) {
  function MSHADOW_THREAD_PREFIX (line 780) | inline static MSHADOW_THREAD_PREFIX PullGlobalThread(void *arg) {
  function MSHADOW_THREAD_PREFIX (line 784) | inline static MSHADOW_THREAD_PREFIX PullLocalThread(void *arg) {
  function GetWorkIndex (line 792) | inline int GetWorkIndex(int devid) const {
  function InitPullMap (line 799) | inline void InitPullMap(int key) {
  function InitPushMap (line 817) | inline void InitPushMap(int key, Shape<2> shape) {

FILE: 3rdparty/mshadow/mshadow-ps/ps_rabit-inl.h
  function namespace (line 34) | namespace mshadow {

FILE: 3rdparty/mshadow/mshadow-ps/thread.h
  function namespace (line 33) | namespace mshadow {
  function ThreadExit (line 121) | inline void ThreadExit(void *status) {
  function namespace (line 132) | namespace mshadow {

FILE: 3rdparty/mshadow/mshadow-ps/thread_util.h
  function namespace (line 32) | namespace mshadow {
  function Destroy (line 146) | inline void Destroy(void) {
  function TValue (line 153) | inline TValue *Get(int key) {
  function TValue (line 166) | inline TValue &GetRef(int key) {
  function Init (line 171) | inline void Init(int key) {

FILE: 3rdparty/mshadow/mshadow/base.h
  type __int16 (line 52) | typedef __int16 int16_t;
  type __int32 (line 53) | typedef __int32 int32_t;
  type __int64 (line 54) | typedef __int64 int64_t;
  type index_t (line 326) | typedef int64_t index_t;
  type index_t (line 328) | typedef int32_t index_t;
  type openmp_index_t (line 333) | typedef int64_t openmp_index_t;
  type index_t (line 336) | typedef index_t openmp_index_t;
  type index_t (line 342) | typedef index_t lapack_index_t;
  type lapack_index_t (line 344) | typedef int lapack_index_t;
  type default_real_t (line 348) | typedef float default_real_t;
  type TypeFlag (line 351) | enum TypeFlag {
  function float (line 371) | struct DataType<float> {
  function double (line 385) | struct DataType<double> {
  function half_t (line 399) | struct DataType<half::half_t> {
  function bf16_t (line 413) | struct DataType<bfloat::bf16_t> {
  function uint8_t (line 418) | struct DataType<uint8_t> {
  function int8_t (line 433) | struct DataType<int8_t> {
  function int32_t (line 447) | struct DataType<int32_t> {
  function int64_t (line 461) | struct DataType<int64_t> {
  function bool (line 466) | struct DataType<bool> {
  function int16_t (line 471) | struct DataType<int16_t> {
  function uint16_t (line 476) | struct DataType<uint16_t> {
  function uint32_t (line 481) | struct DataType<uint32_t> {
  function uint64_t (line 486) | struct DataType<uint64_t> {
  type LayoutFlag (line 498) | enum LayoutFlag {
  function LayoutFlag (line 514) | inline LayoutFlag layoutFlag(std::string layoutstr) {
  function std (line 545) | inline std::string toString(LayoutFlag layout) {
  function kNCHW (line 576) | struct LayoutType<kNCHW> {
  function kNHWC (line 586) | struct LayoutType<kNHWC> {
  function kNCDHW (line 599) | struct LayoutType<kNCDHW> {
  function kNDHWC (line 609) | struct LayoutType<kNDHWC> {
  function namespace (line 622) | namespace op {
  function namespace (line 678) | namespace sv {
  type plusto (line 694) | struct plusto {
  function AlphaBLAS (line 701) | AlphaBLAS(void) { return 1.0f; }
  function default_real_t (line 703) | inline static default_real_t BetaBLAS(void) { return 1.0f; }
  type op (line 705) | typedef op::plus OPType;
  type minimum (line 1108) | struct minimum {
  type DType (line 1167) | typedef float DType;
  type DType (line 1173) | typedef double DType;
  type mshadow (line 1179) | typedef mshadow::half::half_t DType;
  type mshadow (line 1185) | typedef mshadow::bfloat::bf16_t DType;
  type DType (line 1191) | typedef uint8_t DType;
  type DType (line 1197) | typedef int8_t DType;
  type DType (line 1203) | typedef int32_t DType;
  type DType (line 1209) | typedef int64_t DType;
  function mshadow_sizeof (line 1804) | inline size_t mshadow_sizeof(int type) {
  function std (line 1811) | inline std::string dtype_string(const int dtype) {

FILE: 3rdparty/mshadow/mshadow/bfloat.h
  function namespace (line 31) | namespace mshadow {
  function MSHADOW_XINLINE (line 78) | static MSHADOW_XINLINE bf16_t Binary(uint16_t value) {
  function MSHADOW_XINLINE (line 84) | MSHADOW_XINLINE bf16_t() {}
  function MSHADOW_XINLINE (line 86) | MSHADOW_XINLINE bf16_t(const float& value) { constructor(value); }
  function MSHADOW_XINLINE (line 87) | MSHADOW_XINLINE explicit bf16_t(const double& value) { constructor(value...
  function MSHADOW_XINLINE (line 88) | MSHADOW_XINLINE explicit bf16_t(const int8_t& value) { constructor(value...
  function MSHADOW_XINLINE (line 89) | MSHADOW_XINLINE explicit bf16_t(const uint8_t& value) { constructor(valu...
  function MSHADOW_XINLINE (line 90) | MSHADOW_XINLINE explicit bf16_t(const int32_t& value) { constructor(valu...
  function MSHADOW_XINLINE (line 91) | MSHADOW_XINLINE explicit bf16_t(const uint32_t& value) { constructor(val...
  function MSHADOW_XINLINE (line 92) | MSHADOW_XINLINE explicit bf16_t(const int64_t& value) { constructor(valu...
  function MSHADOW_XINLINE (line 93) | MSHADOW_XINLINE explicit bf16_t(const uint64_t& value) { constructor(val...
  function MSHADOW_BF16_ASSIGNOP (line 95) | MSHADOW_BF16_CONVERSIONOP(float)

FILE: 3rdparty/mshadow/mshadow/dot_engine-inl.h
  function namespace (line 36) | namespace mshadow {

FILE: 3rdparty/mshadow/mshadow/expr_engine-inl.h
  function namespace (line 33) | namespace mshadow {
  function MSHADOW_XINLINE (line 105) | MSHADOW_XINLINE DType Eval(index_t y, index_t x) const {
  function explicit (line 117) | explicit Plan(const Plan<EType, SrcDType> &src) : src_(src) {}
  function MSHADOW_XINLINE (line 118) | MSHADOW_XINLINE DstDType Eval(index_t y, index_t x) const {
  function MSHADOW_XINLINE (line 133) | MSHADOW_XINLINE DType Eval(index_t y, index_t x) const {
  function explicit (line 146) | explicit Plan(const Plan<TA, DType> &lhs, const Plan<TB, DType> &rhs)
  function MSHADOW_XINLINE (line 148) | MSHADOW_XINLINE DType Eval(index_t y, index_t x) const {
  function explicit (line 160) | explicit Plan(const Plan<TA, DType> &src) : src_(src) {}
  function MSHADOW_XINLINE (line 161) | MSHADOW_XINLINE DType Eval(index_t y, index_t x) const {
  function src_ (line 172) | src_(src) {}
  function MSHADOW_XINLINE (line 173) | MSHADOW_XINLINE DType Eval(index_t y, index_t x) const {
  function MSHADOW_XINLINE (line 185) | MSHADOW_XINLINE DType Eval(index_t y, index_t x) const {
  type TypeCheckPass (line 333) | struct TypeCheckPass
  type TypeCheckPass (line 335) | struct TypeCheckPass
  function Error_All_Tensor_in_Exp_Must_Have_Same_Type (line 336) | inline static void Error_All_Tensor_in_Exp_Must_Have_Same_Type(void) {}
  function Error_TypeCheck_Not_Pass_For_Reduce_Exp (line 337) | inline static void Error_TypeCheck_Not_Pass_For_Reduce_Exp(void) {}
  function Error_Expression_Does_Not_Meet_Dimension_Req (line 338) | inline static void Error_Expression_Does_Not_Meet_Dimension_Req(void) {}
  function namespace (line 451) | namespace mshadow {

FILE: 3rdparty/mshadow/mshadow/expr_scalar-inl.h
  function namespace (line 34) | namespace mshadow {

FILE: 3rdparty/mshadow/mshadow/expression.h
  function namespace (line 29) | namespace mshadow {
  function explicit (line 136) | explicit TransposeExp(const EType &e) : exp(e) {}
  function EType (line 138) | inline const EType &T(void) const {
  function Container (line 178) | inline Container &__assign(DType s) {
  function explicit (line 341) | explicit BinaryMapExp(const TA &lhs, const TB &rhs)
  function explicit (line 409) | explicit UnaryMapExp(const TA &src) : src_(src) {}

FILE: 3rdparty/mshadow/mshadow/extension/broadcast.h
  function namespace (line 28) | namespace mshadow {
  function MSHADOW_XINLINE (line 145) | MSHADOW_XINLINE DType Eval(index_t y, index_t x) const {
  function explicit (line 158) | explicit Plan(const Broadcast1DExp<SrcExp, DType, dimdst, 1> &e)
  function MSHADOW_XINLINE (line 160) | MSHADOW_XINLINE DType Eval(index_t y, index_t x) const {
  function explicit (line 172) | explicit Plan(const BroadcastScalarExp<SrcExp, DType, dimdst> &e)
  function MSHADOW_XINLINE (line 174) | MSHADOW_XINLINE DType Eval(index_t y, index_t x) const {

FILE: 3rdparty/mshadow/mshadow/extension/broadcast_with_axis.h
  function namespace (line 31) | namespace mshadow {
  function src_ (line 148) | src_(src) {
  function MSHADOW_XINLINE (line 240) | MSHADOW_XINLINE DType Eval(index_t i, index_t j) const {

FILE: 3rdparty/mshadow/mshadow/extension/channel_pool.h
  function namespace (line 29) | namespace mshadow {

FILE: 3rdparty/mshadow/mshadow/extension/channel_unpool.h
  function namespace (line 29) | namespace mshadow {

FILE: 3rdparty/mshadow/mshadow/extension/choose.h
  function namespace (line 30) | namespace mshadow {

FILE: 3rdparty/mshadow/mshadow/extension/complex.h
  function namespace (line 30) | namespace mshadow {
  type exchange (line 76) | struct exchange {
  type pad_imag (line 90) | struct pad_imag {
  type toreal (line 104) | struct toreal {
  type abs_square (line 113) | struct abs_square {
  type sum_real_imag (line 123) | struct sum_real_imag {
  function namespace (line 135) | namespace expr {
  function explicit (line 173) | explicit ComplexUnitaryExp(const TA &src) : src_(src) {}
  function explicit (line 390) | explicit Plan(const Plan<TA, DType> &lhs, const Plan<TB, DType> &rhs)
  function MSHADOW_XINLINE (line 392) | MSHADOW_XINLINE DType Eval(index_t y, index_t x) const {
  function explicit (line 412) | explicit Plan(const Plan<TA, DType> &lhs, const Plan<TB, DType> &rhs)
  function MSHADOW_XINLINE (line 414) | MSHADOW_XINLINE DType Eval(index_t y, index_t x) const {
  function explicit (line 435) | explicit Plan(const Plan<TA, DType> &lhs, const Plan<TB, DType> &rhs)
  function MSHADOW_XINLINE (line 437) | MSHADOW_XINLINE DType Eval(index_t y, index_t x) const {
  function MSHADOW_XINLINE (line 459) | MSHADOW_XINLINE DType Eval(index_t y, index_t x) const {
  function MSHADOW_XINLINE (line 477) | MSHADOW_XINLINE DType Eval(index_t y, index_t x) const {
  function MSHADOW_XINLINE (line 498) | MSHADOW_XINLINE DType Eval(index_t y, index_t x) const {

FILE: 3rdparty/mshadow/mshadow/extension/concat.h
  function namespace (line 29) | namespace mshadow {
  function MSHADOW_XINLINE (line 162) | MSHADOW_XINLINE DType &REval(index_t i, index_t j) {
  function MSHADOW_XINLINE (line 190) | MSHADOW_XINLINE DType Eval(index_t y, index_t x) const {
  function MSHADOW_XINLINE (line 197) | MSHADOW_XINLINE DType &REval(index_t y, index_t x) {

FILE: 3rdparty/mshadow/mshadow/extension/crop.h
  function namespace (line 28) | namespace mshadow {
  function MSHADOW_XINLINE (line 121) | MSHADOW_XINLINE DType Eval(index_t i, index_t j) const {

FILE: 3rdparty/mshadow/mshadow/extension/fill.h
  function namespace (line 31) | namespace mshadow {

FILE: 3rdparty/mshadow/mshadow/extension/flip.h
  function namespace (line 30) | namespace mshadow {
  function MSHADOW_XINLINE (line 125) | MSHADOW_XINLINE DType Eval(index_t i, index_t j) const {
  function MSHADOW_XINLINE (line 134) | MSHADOW_XINLINE DType &REval(index_t i, index_t j) const {

FILE: 3rdparty/mshadow/mshadow/extension/implicit_gemm.h
  function namespace (line 31) | namespace mshadow {

FILE: 3rdparty/mshadow/mshadow/extension/mask.h
  function namespace (line 30) | namespace mshadow {

FILE: 3rdparty/mshadow/mshadow/extension/mirror.h
  function namespace (line 28) | namespace mshadow {
  function explicit (line 68) | explicit Plan(const MirroringExp<SrcExp, DType, srcdim> &e)
  function MSHADOW_XINLINE (line 70) | MSHADOW_XINLINE DType Eval(index_t i, index_t j) const {

FILE: 3rdparty/mshadow/mshadow/extension/one_hot.h
  function namespace (line 31) | namespace mshadow {

FILE: 3rdparty/mshadow/mshadow/extension/pack_col2patch.h
  function namespace (line 29) | namespace mshadow {

FILE: 3rdparty/mshadow/mshadow/extension/pad.h
  function namespace (line 28) | namespace mshadow {

FILE: 3rdparty/mshadow/mshadow/extension/range.h
  function namespace (line 30) | namespace mshadow {

FILE: 3rdparty/mshadow/mshadow/extension/reduce_with_axis.h
  function namespace (line 30) | namespace mshadow {
  function MSHADOW_XINLINE (line 122) | MSHADOW_XINLINE DType Eval(index_t i, index_t j) const {

FILE: 3rdparty/mshadow/mshadow/extension/reduceto1d.h
  function namespace (line 28) | namespace mshadow {
  function Eval (line 102) | inline static void Eval(Tensor<Device, 1, DType> *dst,

FILE: 3rdparty/mshadow/mshadow/extension/reshape.h
  function namespace (line 28) | namespace mshadow {
  function MSHADOW_XINLINE (line 79) | MSHADOW_XINLINE DType Eval(index_t y, index_t x) const {
  function explicit (line 92) | explicit Plan(const ReshapeExp<SrcExp, DType, dimdst, 1> &e)
  function MSHADOW_XINLINE (line 95) | MSHADOW_XINLINE DType Eval(index_t y, index_t x) const {

FILE: 3rdparty/mshadow/mshadow/extension/slice.h
  function namespace (line 29) | namespace mshadow {
  function MSHADOW_XINLINE (line 139) | MSHADOW_XINLINE DType &REval(index_t i, index_t j) {
  function explicit (line 158) | explicit Plan(const SliceExp<SrcExp, Device, DType, srcdim, 1> &e)
  function MSHADOW_XINLINE (line 161) | MSHADOW_XINLINE DType Eval(index_t y, index_t x) const {
  function MSHADOW_XINLINE (line 164) | MSHADOW_XINLINE DType &REval(index_t y, index_t x) {

FILE: 3rdparty/mshadow/mshadow/extension/slice_ex.h
  function namespace (line 29) | namespace mshadow {
  function MSHADOW_XINLINE (line 124) | MSHADOW_XINLINE DType Eval(index_t i, index_t j) const {
  function MSHADOW_XINLINE (line 135) | MSHADOW_XINLINE DType &REval(index_t i, index_t j) {

FILE: 3rdparty/mshadow/mshadow/extension/spatial_pool.h
  function namespace (line 29) | namespace mshadow {

FILE: 3rdparty/mshadow/mshadow/extension/spatial_unpool.h
  function namespace (line 29) | namespace mshadow {

FILE: 3rdparty/mshadow/mshadow/extension/spatial_upsampling_nearest.h
  function namespace (line 29) | namespace mshadow {

FILE: 3rdparty/mshadow/mshadow/extension/swapaxis.h
  function namespace (line 30) | namespace mshadow {
  function MSHADOW_XINLINE (line 113) | MSHADOW_XINLINE DType Eval(index_t i, index_t x) const {

FILE: 3rdparty/mshadow/mshadow/extension/take.h
  function namespace (line 30) | namespace mshadow {

FILE: 3rdparty/mshadow/mshadow/extension/take_grad.h
  function namespace (line 30) | namespace mshadow {

FILE: 3rdparty/mshadow/mshadow/extension/transpose.h
  function namespace (line 29) | namespace mshadow {
  function MSHADOW_XINLINE (line 88) | MSHADOW_XINLINE DType Eval(index_t i, index_t j) const {
  function MSHADOW_XINLINE (line 176) | MSHADOW_XINLINE DType Eval(index_t i, index_t j) const {

FILE: 3rdparty/mshadow/mshadow/extension/unpack_patch2col.h
  function namespace (line 28) | namespace mshadow {

FILE: 3rdparty/mshadow/mshadow/half.h
  function MSHADOW_XINLINE (line 45) | MSHADOW_XINLINE float __half2float_warp(const volatile __half& h) { /* N...
  function namespace (line 60) | namespace mshadow {
  function half_t (line 82) | half_t operator AOP (const volatile T& a) volatile {    \
  function MSHADOW_XINLINE (line 121) | static MSHADOW_XINLINE half_t Binary(uint16_t value) {
  function MSHADOW_XINLINE (line 127) | MSHADOW_XINLINE half_t() {}
  function MSHADOW_XINLINE (line 130) | MSHADOW_XINLINE explicit half_t(const __half& value) {
  function MSHADOW_XINLINE (line 135) | MSHADOW_XINLINE half_t(const float& value) { constructor(value); }
  function MSHADOW_XINLINE (line 136) | MSHADOW_XINLINE explicit half_t(const double& value) { constructor(value...
  function MSHADOW_XINLINE (line 137) | MSHADOW_XINLINE explicit half_t(const int8_t& value) { constructor(value...
  function MSHADOW_XINLINE (line 138) | MSHADOW_XINLINE explicit half_t(const uint8_t& value) { constructor(valu...
  function MSHADOW_XINLINE (line 139) | MSHADOW_XINLINE explicit half_t(const int32_t& value) { constructor(valu...
  function MSHADOW_XINLINE (line 140) | MSHADOW_XINLINE explicit half_t(const uint32_t& value) { constructor(val...
  function MSHADOW_XINLINE (line 141) | MSHADOW_XINLINE explicit half_t(const int64_t& value) { constructor(valu...
  function MSHADOW_XINLINE (line 142) | MSHADOW_XINLINE explicit half_t(const uint64_t& value) { constructor(val...
  function MSHADOW_HALF_ASSIGNOP (line 144) | MSHADOW_HALF_CONVERSIONOP(float)
  function MSHADOW_XINLINE (line 215) | MSHADOW_XINLINE uint16_t float2half(const float& value) const {
  function MSHADOW_XINLINE (line 263) | MSHADOW_XINLINE uint16_t float2half(const volatile float& value) const v...
  function MSHADOW_XINLINE (line 303) | MSHADOW_XINLINE float half2float(const uint16_t& value) const {
  function MSHADOW_XINLINE (line 321) | MSHADOW_XINLINE float half2float(const volatile uint16_t& value) const v...
  function constructor (line 340) | void constructor(const T& value) {

FILE: 3rdparty/mshadow/mshadow/io.h
  function namespace (line 29) | namespace mshadow {

FILE: 3rdparty/mshadow/mshadow/packet-inl.h
  function namespace (line 37) | namespace mshadow {
  function namespace (line 69) | namespace mshadow {
  function MSHADOW_CINLINE (line 208) | MSHADOW_CINLINE static void Save(TFloat *dst, const Packet<TFloat, Arch>...
  function MSHADOW_CINLINE (line 216) | MSHADOW_CINLINE static void Save(TFloat *dst, const Packet<TFloat, Arch>...
  function namespace (line 228) | namespace mshadow {
  function explicit (line 265) | explicit PacketPlan(DType scalar) : scalar_(scalar) {}
  function MSHADOW_CINLINE (line 269) | MSHADOW_CINLINE DType Eval(index_t y, index_t x) const {
  function MSHADOW_CINLINE (line 285) | MSHADOW_CINLINE DType Eval(index_t y, index_t x) const {
  function src_ (line 297) | src_(src) {}
  function MSHADOW_CINLINE (line 298) | MSHADOW_CINLINE packet::Packet<DType> EvalPacket(index_t y, index_t x) c...
  function MSHADOW_CINLINE (line 301) | MSHADOW_CINLINE DType Eval(index_t y, index_t x) const {
  function Check (line 380) | inline static bool Check(const E &exp) {

FILE: 3rdparty/mshadow/mshadow/packet/plain-inl.h
  function namespace (line 30) | namespace mshadow {
  function MSHADOW_CINLINE (line 61) | MSHADOW_CINLINE void Store(DType* dst) const {

FILE: 3rdparty/mshadow/mshadow/packet/sse-inl.h
  function namespace (line 32) | namespace mshadow {
  function MSHADOW_CINLINE (line 63) | MSHADOW_CINLINE void Store(float* dst) const {
  function MSHADOW_CINLINE (line 67) | MSHADOW_CINLINE float Sum() const {
  function LoadUnAligned (line 98) | double, kSSE2> LoadUnAligned(const double* src) {
  function MSHADOW_CINLINE (line 107) | MSHADOW_CINLINE void Store(double* dst) const {
  function Sum (line 111) | inline double Sum(void) const {

FILE: 3rdparty/mshadow/mshadow/random.h
  function namespace (line 38) | namespace mshadow {

FILE: 3rdparty/mshadow/mshadow/stream_gpu-inl.h
  function namespace (line 32) | namespace mshadow {

FILE: 3rdparty/mshadow/mshadow/tensor.h
  function namespace (line 37) | namespace mshadow {
  function MSHADOW_XINLINE (line 220) | MSHADOW_XINLINE Shape<1> Shape1(index_t s0) {
  function MSHADOW_XINLINE (line 230) | MSHADOW_XINLINE Shape<2> Shape2(index_t s0, index_t s1) {
  function MSHADOW_XINLINE (line 241) | MSHADOW_XINLINE Shape<3> Shape3(index_t s0, index_t s1, index_t s2) {
  function MSHADOW_XINLINE (line 254) | MSHADOW_XINLINE Shape<4> Shape4(index_t s0, index_t s1,
  function MSHADOW_XINLINE (line 269) | MSHADOW_XINLINE Shape<5> Shape5(index_t s0, index_t s1, index_t s2,
  function Shape (line 283) | inline Shape<3> ConvertLayout(const Shape<3>& src, int src_layout, int d...
  function Shape (line 320) | inline Shape<4> ConvertLayout(const Shape<4>& src, int src_layout, int d...
  function Shape (line 360) | inline Shape<5> ConvertLayout(const Shape<5>& src, int src_layout, int d...
  function Wait (line 495) | inline void Wait(void) {}
  function CheckIdle (line 500) | inline bool CheckIdle(void) {
  function CreateBlasHandle (line 504) | inline void CreateBlasHandle() {}
  function MSHADOW_XINLINE (line 556) | MSHADOW_XINLINE Tensor(void) : stream_(NULL) {}
  function MSHADOW_XINLINE (line 558) | MSHADOW_XINLINE Tensor(const Shape<dimension> &shape)
  function index_t (line 584) | index_t MemSize(void) const {
  function MSHADOW_XINLINE (line 596) | MSHADOW_XINLINE bool CheckContiguous(void) const {
  function MSHADOW_XINLINE (line 602) | MSHADOW_XINLINE index_t MSize(void) const {
  function MSHADOW_XINLINE (line 610) | MSHADOW_XINLINE index_t size(int idx) const {
  function MSHADOW_XINLINE (line 681) | MSHADOW_XINLINE Tensor(void) : stream_(NULL) {}
  function MSHADOW_XINLINE (line 682) | MSHADOW_XINLINE Tensor(const Shape<1> &shape)
  function MSHADOW_XINLINE (line 705) | MSHADOW_XINLINE bool CheckContiguous(void) const {
  function MSHADOW_XINLINE (line 708) | MSHADOW_XINLINE index_t MSize(void) const {
  function MSHADOW_XINLINE (line 711) | MSHADOW_XINLINE index_t size(index_t i) const {
  function MSHADOW_XINLINE (line 714) | MSHADOW_XINLINE DType &operator[](index_t idx) {
  function MSHADOW_XINLINE (line 717) | MSHADOW_XINLINE const DType &operator[](index_t idx) const {

FILE: 3rdparty/mshadow/mshadow/tensor_container.h
  function namespace (line 30) | namespace mshadow {
  function Resize (line 99) | inline void Resize(const Shape<dimension> &shape) {
  function Resize (line 117) | inline void Resize(const Shape<dimension> &shape, DType initv) {
  function set_pad (line 122) | inline void set_pad(bool pad) {
  function Release (line 190) | inline void Release(void) {
  function AllocByShape (line 212) | inline void AllocByShape(const Shape<dimension>& shape) {

FILE: 3rdparty/mshadow/mshadow/tensor_cpu-inl.h
  function namespace (line 36) | namespace mshadow {
  function size (line 78) | size_t size) {
  function size (line 90) | size_t size) {
  type Shape (line 260) | typedef Shape<expr::ExpInfo<E>::kDim> EShape;

FILE: 3rdparty/mshadow/mshadow/tensor_gpu-inl.h
  function namespace (line 30) | namespace mshadow {
  function namespace (line 118) | namespace mshadow {
  type Shape (line 160) | typedef Shape<expr::ExpInfo<E>::kDim> EShape;
  function alpha (line 215) | float alpha) {

FILE: benchmark/opperf/custom_operations/custom_operations.py
  class CustomAddOne (line 29) | class CustomAddOne(mx.operator.CustomOp):
    method forward (line 30) | def forward(self, is_train, req, in_data, out_data, aux):
    method backward (line 33) | def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
  class CustomAddOneProp (line 38) | class CustomAddOneProp(mx.operator.CustomOpProp):
    method __init__ (line 39) | def __init__(self):
    method list_arguments (line 42) | def list_arguments(self):
    method list_outputs (line 45) | def list_outputs(self):
    method infer_shape (line 48) | def infer_shape(self, in_shape):
    method create_operator (line 52) | def create_operator(self, ctx, shapes, dtypes):

FILE: benchmark/opperf/nd_operations/array_manipulation_operators.py
  function run_rearrange_operators_benchmarks (line 72) | def run_rearrange_operators_benchmarks(ctx=mx.cpu(), dtype='float32', pr...
  function run_shape_operators_benchmarks (line 104) | def run_shape_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profil...
  function run_expanding_operators_benchmarks (line 136) | def run_expanding_operators_benchmarks(ctx=mx.cpu(), dtype='float32', pr...
  function run_rounding_operators_benchmarks (line 168) | def run_rounding_operators_benchmarks(ctx=mx.cpu(), dtype='float32', pro...
  function run_join_split_operators_benchmarks (line 200) | def run_join_split_operators_benchmarks(ctx=mx.cpu(), dtype='float32', p...

FILE: benchmark/opperf/nd_operations/array_rearrange.py
  function run_rearrange_operators_benchmarks (line 32) | def run_rearrange_operators_benchmarks(ctx=mx.cpu(), dtype='float32', pr...

FILE: benchmark/opperf/nd_operations/binary_operators.py
  function run_mx_binary_misc_operators_benchmarks (line 41) | def run_mx_binary_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32...
  function run_mx_binary_broadcast_operators_benchmarks (line 72) | def run_mx_binary_broadcast_operators_benchmarks(ctx=mx.cpu(), dtype='fl...
  function run_mx_binary_element_wise_operators_benchmarks (line 103) | def run_mx_binary_element_wise_operators_benchmarks(ctx=mx.cpu(), dtype=...

FILE: benchmark/opperf/nd_operations/gemm_operators.py
  function run_gemm_operators_benchmarks (line 38) | def run_gemm_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profile...

FILE: benchmark/opperf/nd_operations/indexing_routines.py
  function run_indexing_routines_benchmarks (line 38) | def run_indexing_routines_benchmarks(ctx=mx.cpu(), dtype='float32', prof...

FILE: benchmark/opperf/nd_operations/linalg_operators.py
  function run_linalg_operators_benchmarks (line 37) | def run_linalg_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profi...

FILE: benchmark/opperf/nd_operations/misc_operators.py
  function run_mx_misc_operators_benchmarks (line 40) | def run_mx_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', prof...

FILE: benchmark/opperf/nd_operations/nn_activation_operators.py
  function run_activation_operators_benchmarks (line 48) | def run_activation_operators_benchmarks(ctx=mx.cpu(), dtype='float32', p...

FILE: benchmark/opperf/nd_operations/nn_basic_operators.py
  function run_nn_basic_operators_benchmarks (line 47) | def run_nn_basic_operators_benchmarks(ctx=mx.cpu(), dtype='float32', pro...

FILE: benchmark/opperf/nd_operations/nn_conv_operators.py
  function run_pooling_operators_benchmarks (line 55) | def run_pooling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', prof...
  function run_convolution_operators_benchmarks (line 153) | def run_convolution_operators_benchmarks(ctx=mx.cpu(), dtype='float32', ...
  function run_transpose_convolution_operators_benchmarks (line 249) | def run_transpose_convolution_operators_benchmarks(ctx=mx.cpu(), profile...

FILE: benchmark/opperf/nd_operations/nn_loss_operators.py
  function run_loss_operators_benchmarks (line 31) | def run_loss_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profile...

FILE: benchmark/opperf/nd_operations/nn_optimizer_operators.py
  function run_optimizer_operators_benchmarks (line 57) | def run_optimizer_operators_benchmarks(ctx=mx.cpu(), dtype='float32', pr...

FILE: benchmark/opperf/nd_operations/random_sampling_operators.py
  function run_mx_random_sampling_operators_benchmarks (line 37) | def run_mx_random_sampling_operators_benchmarks(ctx=mx.cpu(), dtype='flo...

FILE: benchmark/opperf/nd_operations/reduction_operators.py
  function run_mx_reduction_operators_benchmarks (line 34) | def run_mx_reduction_operators_benchmarks(ctx=mx.cpu(), dtype='float32',...

FILE: benchmark/opperf/nd_operations/sorting_searching_operators.py
  function run_sorting_searching_operators_benchmarks (line 32) | def run_sorting_searching_operators_benchmarks(ctx=mx.cpu(), dtype='floa...

FILE: benchmark/opperf/nd_operations/unary_operators.py
  function run_mx_unary_operators_benchmarks (line 41) | def run_mx_unary_operators_benchmarks(ctx=mx.cpu(), dtype='float32', pro...

FILE: benchmark/opperf/opperf.py
  function run_all_mxnet_operator_benchmarks (line 56) | def run_all_mxnet_operator_benchmarks(ctx=mx.cpu(), dtype='float32', pro...
  function _parse_mxnet_context (line 146) | def _parse_mxnet_context(ctx):
  function main (line 157) | def main():

FILE: benchmark/opperf/utils/benchmark_operators_pytest.py
  function generate_test_cases (line 56) | def generate_test_cases():
  function generate_test_ids (line 63) | def generate_test_ids():
  function test (line 105) | def test(op_name, shape, params):

FILE: benchmark/opperf/utils/benchmark_utils.py
  function _prepare_op_inputs (line 33) | def _prepare_op_inputs(inputs, run_backward, dtype, ctx, module):
  function get_mx_np_ndarray (line 56) | def get_mx_np_ndarray(ctx, in_tensor, dtype, initializer, attach_grad=Tr...
  function adjust_op_name (line 104) | def adjust_op_name(module, name):
  function parse_input_ndarray (line 123) | def parse_input_ndarray(input_dict):
  function _run_operator_performance_test (line 173) | def _run_operator_performance_test(op, inputs, run_backward, warmup, run...
  function run_performance_test (line 206) | def run_performance_test(ops, inputs, run_backward=True,
  function run_benchmark_operator (line 257) | def run_benchmark_operator(name, size = (128,128), additional_inputs = {},
  function run_op_benchmarks (line 283) | def run_op_benchmarks(ops, dtype, ctx, profiler, int64_tensor, warmup, r...

FILE: benchmark/opperf/utils/common_utils.py
  function merge_map_list (line 26) | def merge_map_list(map_list):
  function save_to_file (line 52) | def save_to_file(inp_dict, out_filepath, out_format='json', runtime_feat...
  function get_json (line 82) | def get_json(inp_dict):
  function _prepare_op_benchmark_result (line 98) | def _prepare_op_benchmark_result(op, op_bench_result, profiler):
  function _prepare_markdown (line 135) | def _prepare_markdown(results, runtime_features=None, profiler='native'):

FILE: benchmark/opperf/utils/ndarray_utils.py
  function nd_forward_backward_and_profile (line 23) | def nd_forward_backward_and_profile(op, runs, **kwargs):
  function nd_forward_and_profile (line 64) | def nd_forward_and_profile(op, runs, **kwargs):
  function get_mx_ndarray (line 102) | def get_mx_ndarray(ctx, in_tensor, dtype, initializer, attach_grad=True):

FILE: benchmark/opperf/utils/op_registry_utils.py
  function _select_ops (line 26) | def _select_ops(operator_names, filters=("_contrib", "_"), merge_op_forw...
  function _set_op_arguments (line 80) | def _set_op_arguments(mx_operators):
  function _get_all_mxnet_operators (line 90) | def _get_all_mxnet_operators():
  function prepare_op_inputs (line 100) | def prepare_op_inputs(arg_params, arg_values):
  function prepare_op_inputs (line 112) | def prepare_op_inputs(op, arg_params, int64_tensor):
  function get_all_unary_operators (line 214) | def get_all_unary_operators():
  function get_all_broadcast_binary_operators (line 237) | def get_all_broadcast_binary_operators():
  function get_all_misc_binary_operators (line 257) | def get_all_misc_binary_operators():
  function get_all_elemen_wise_binary_operators (line 277) | def get_all_elemen_wise_binary_operators():
  function get_all_random_sampling_operators (line 299) | def get_all_random_sampling_operators():
  function get_all_linalg_operators (line 320) | def get_all_linalg_operators():
  function get_all_reduction_operators (line 343) | def get_all_reduction_operators():
  function get_all_nn_basic_operators (line 362) | def get_all_nn_basic_operators():
  function get_all_nn_activation_operators (line 384) | def get_all_nn_activation_operators():
  function get_all_optimizer_operators (line 404) | def get_all_optimizer_operators():
  function get_all_sorting_searching_operators (line 426) | def get_all_sorting_searching_operators():
  function get_all_rearrange_operators (line 446) | def get_all_rearrange_operators():
  function get_remaining_miscellaneous_operators (line 467) | def get_remaining_miscellaneous_operators():
  function get_all_indexing_routines (line 486) | def get_all_indexing_routines():
  function get_all_loss_operators (line 512) | def get_all_loss_operators():
  function get_all_shape_operators (line 532) | def get_all_shape_operators():
  function get_all_expanding_operators (line 553) | def get_all_expanding_operators():
  function get_all_rounding_operators (line 574) | def get_all_rounding_operators():
  function get_operators_with_no_benchmark (line 595) | def get_operators_with_no_benchmark(operators_with_benchmark):
  function get_current_runtime_features (line 615) | def get_current_runtime_features():

FILE: benchmark/opperf/utils/profiler_utils.py
  function _get_memory_profile (line 34) | def _get_memory_profile(memory_profile_results):
  function _get_operator_profile (line 45) | def _get_operator_profile(operator_name, operator_profile_results):
  function parse_profiler_dump (line 84) | def parse_profiler_dump(operator_name, profiler_dump):
  function cpp_profile (line 171) | def cpp_profile(func):
  function python_profile (line 223) | def python_profile(func):

FILE: benchmark/python/control_flow/rnn.py
  class ForeachRNN (line 39) | class ForeachRNN(gluon.HybridBlock):
    method __init__ (line 40) | def __init__(self, cell, length):
    method forward (line 45) | def forward(self, inputs, states):
  class WhileRNN (line 50) | class WhileRNN(gluon.HybridBlock):
    method __init__ (line 51) | def __init__(self, cell, length):
    method forward (line 56) | def forward(self, inputs, states):
  function _zeros (line 73) | def _zeros(shape, ctx):
  function _array (line 77) | def _array(shape, ctx):
  function _get_gpus (line 81) | def _get_gpus():
  function run_benchmark (line 84) | def run_benchmark(cell_type, ctx, seq_len, batch_size, hidden_dim):
  function main (line 119) | def main():

FILE: benchmark/python/dnnl/fc_add.py
  function dump_graph_fn (line 42) | def dump_graph_fn(net, postfix):
  function operator_string (line 46) | def operator_string(elemwise_add):
  function print_header (line 49) | def print_header(header):
  function print_value (line 60) | def print_value(shape, hidden, mean):
  function measure (line 67) | def measure(net, data0, data1, data2, shape, nhid):
  class FCWithSum (line 82) | class FCWithSum(nn.HybridBlock):
    method __init__ (line 83) | def __init__(self, num_in, num_hidden, elemwise_add, **kwargs):
    method forward (line 89) | def forward(self, data0, data1, data2):
  function benchmark_float (line 100) | def benchmark_float(elemwise_add, broadcast=False):
  class CalibIter (line 118) | class CalibIter(mx.io.DataIter):
    method __init__ (line 119) | def __init__(self, batch, data_shape, batch_size):
    method __iter__ (line 129) | def __iter__(self):
  function benchmark_int8 (line 132) | def benchmark_int8(quantize_mode, quantize_granularity, elemwise_add, br...

FILE: benchmark/python/einsum/benchmark_einsum.py
  function measure_cost (line 22) | def measure_cost(repeat, func_name, *args, **kwargs):
  function test_np_einsum (line 35) | def test_np_einsum():

FILE: benchmark/python/ffi/benchmark_ffi.py
  class OpArgMngr (line 22) | class OpArgMngr(object):
    method add_workload (line 27) | def add_workload(funcname, *args, **kwargs):
  function generate_workloads (line 38) | def generate_workloads():
  function prepare_workloads (line 51) | def prepare_workloads():
  function benchmark_helper (line 215) | def benchmark_helper(f, *args, **kwargs):
  function get_op (line 220) | def get_op(module, funcname):
  function run_benchmark (line 227) | def run_benchmark(packages):
  function show_results (line 242) | def show_results(results):

FILE: benchmark/python/metric/benchmark_metric.py
  class MetricDataGen (line 26) | class MetricDataGen(object):
    method __init__ (line 28) | def __init__(self, n, c, pred_ctx, label_ctx):
    method data (line 34) | def data(self):
    method batch_size (line 41) | def batch_size(self):
    method output_dim (line 45) | def output_dim(self):
  class F1MetricDataGen (line 49) | class F1MetricDataGen(MetricDataGen):
    method __init__ (line 51) | def __init__(self, n, c, pred_ctx, label_ctx):
  class PearsonMetricDataGen (line 55) | class PearsonMetricDataGen(MetricDataGen):
    method __init__ (line 57) | def __init__(self, n, c, pred_ctx, label_ctx):
    method data (line 60) | def data(self):
  function run_metric (line 67) | def run_metric(name, data_gen_cls, i, n, c, pred_ctx, label_ctx, **kwargs):
  function test_metric_performance (line 84) | def test_metric_performance():

FILE: benchmark/python/quantization/benchmark_op.py
  function quantize_int8_helper (line 23) | def quantize_int8_helper(data):
  function benchmark_convolution (line 29) | def benchmark_convolution(data_shape, kernel, num_filter, pad, stride, n...

FILE: benchmark/python/sparse/cast_storage.py
  function measure_cost (line 32) | def measure_cost(repeat, f, *args, **kwargs):
  function run_cast_storage_synthetic (line 42) | def run_cast_storage_synthetic():

FILE: benchmark/python/sparse/dot.py
  function measure_cost (line 110) | def measure_cost(repeat, scipy_trans_lhs, scipy_dns_lhs, func_name, *arg...
  function _get_iter (line 128) | def _get_iter(path, data_shape, batch_size):
  function _line_count (line 136) | def _line_count(path):
  function _compare_sparse_dense (line 140) | def _compare_sparse_dense(data_dir, file_name, mini_file_name, feature_dim,
  function test_dot_real (line 199) | def test_dot_real(data_dict):
  function test_dot_synthetic (line 259) | def test_dot_synthetic(data_dict):

FILE: benchmark/python/sparse/memory_benchmark.py
  function parse_args (line 28) | def parse_args():
  function main (line 62) | def main():
  function bench_dot (line 79) | def bench_dot(lhs_row_dim, lhs_col_dim, rhs_col_dim, density,

FILE: benchmark/python/sparse/sparse_op.py
  function measure_cost (line 58) | def measure_cost(repeat, f, *args, **kwargs):
  function test_dot_real (line 71) | def test_dot_real(data_dict):
  function test_dot_synthetic (line 136) | def test_dot_synthetic():

FILE: benchmark/python/sparse/util.py
  function estimate_density (line 21) | def estimate_density(DATA_PATH, feature_size):

FILE: benchmark/python/tvmop/benchmark_tvmop.py
  function measure_cost (line 23) | def measure_cost(repeat, func_name, *args, **kwargs):
  function test_tvm_dot (line 36) | def test_tvm_dot():

FILE: cd/python/pypi/pypi_publish.py
  function post_wheel (line 31) | def post_wheel(path):
  function get_secret (line 60) | def get_secret():

FILE: cd/utils/artifact_repository.py
  function config_logging (line 47) | def config_logging():
  function s3_upload (line 56) | def s3_upload(bucket: str, s3_key_prefix: str, paths: List[str]):
  function write_libmxnet_meta (line 71) | def write_libmxnet_meta(args: argparse.Namespace, destination: str):
  function try_s3_download (line 87) | def try_s3_download(bucket, s3_key_prefix, destination) -> bool:
  function get_commit_id_from_cmd (line 128) | def get_commit_id_from_cmd() -> Optional[str]:
  function probe_commit_id (line 144) | def probe_commit_id() -> str:
  function get_linux_os_release_properties (line 164) | def get_linux_os_release_properties() -> Optional[Dict[str, str]]:
  function get_linux_distribution_and_version (line 185) | def get_linux_distribution_and_version() -> Optional[str]:
  function probe_operating_system (line 204) | def probe_operating_system() -> str:
  function get_libmxnet_features (line 219) | def get_libmxnet_features(libmxnet_path: str) -> Optional[Dict[str, bool]]:
  function get_cuda_version (line 259) | def get_cuda_version() -> Optional[str]:
  function probe_cpu_variant (line 287) | def probe_cpu_variant(mxnet_features: Dict[str, bool]) -> str:
  function probe_gpu_variant (line 302) | def probe_gpu_variant(mxnet_features: Dict[str, bool]) -> Optional[str]:
  function probe_mxnet_variant (line 323) | def probe_mxnet_variant(limxnet_path: str) -> Optional[str]:
  function probe_artifact_repository_bucket (line 340) | def probe_artifact_repository_bucket() -> Optional[str]:
  function probe (line 352) | def probe(args: argparse.Namespace) -> argparse.Namespace:
  function get_s3_key_prefix (line 392) | def get_s3_key_prefix(args: argparse.Namespace, subdir: str = '') -> str:
  function push_artifact (line 405) | def push_artifact(args: argparse.Namespace):
  function pull_artifact (line 450) | def pull_artifact(args: argparse.Namespace):
  function is_file (line 480) | def is_file(path: str) -> str:
  function sanitize_path_array (line 492) | def sanitize_path_array(paths: List[str]) -> List[str]:
  function main (line 503) | def main() -> int:

FILE: cd/utils/test_artifact_repository.py
  class TestArtifactRepositoryTool (line 26) | class TestArtifactRepositoryTool(unittest.TestCase):
    method create_argparse_namespace (line 29) | def create_argparse_namespace(libmxnet_path: Optional[str] = 'path_to_...
    method test_get_commit_id_from_cmd_returns_none_on_fail (line 53) | def test_get_commit_id_from_cmd_returns_none_on_fail(self, mock):
    method test_probe_commit_id_mxnet_sha (line 61) | def test_probe_commit_id_mxnet_sha(self):
    method test_probe_commit_id_git_commit (line 68) | def test_probe_commit_id_git_commit(self):
    method test_probe_commit_id_git_cmd (line 80) | def test_probe_commit_id_git_cmd(self, mock):
    method test_get_linux_os_release_properties (line 88) | def test_get_linux_os_release_properties(self):
    method test_get_linux_os_release_properties_with_quotes (line 101) | def test_get_linux_os_release_properties_with_quotes(self):
    method test_probe_operating_system_windows (line 116) | def test_probe_operating_system_windows(self, mock):
    method test_probe_operating_system_darwin (line 121) | def test_probe_operating_system_darwin(self, mock):
    method test_probe_operating_system_linux (line 127) | def test_probe_operating_system_linux(self, mock_props, mock_sys):
    method test_get_cuda_version (line 138) | def test_get_cuda_version(self, mock):
    method test_get_cuda_version_not_found (line 152) | def test_get_cuda_version_not_found(self, mock):
    method test_probe_variant_native (line 162) | def test_probe_variant_native(self, mock_features):
    method test_probe_variant_cpu (line 170) | def test_probe_variant_cpu(self, mock_features):
    method test_probe_variant_cuda (line 179) | def test_probe_variant_cuda(self, mock_cuda_version, mock_features):
    method test_probe_variant_cuda_returns_none_on_no_features (line 188) | def test_probe_variant_cuda_returns_none_on_no_features(self, mock_fea...
    method test_probe_variant_cuda_mkl (line 197) | def test_probe_variant_cuda_mkl(self, mock_cuda_version, mock_features):
    method test_probe_artifact_repository_bucket (line 206) | def test_probe_artifact_repository_bucket(self):
    method test_probe_no_commit_id (line 214) | def test_probe_no_commit_id(self, mock):
    method test_probe_no_commit_id_failed (line 225) | def test_probe_no_commit_id_failed(self, mock):
    method test_probe_no_operating_system (line 236) | def test_probe_no_operating_system(self, mock):
    method test_probe_no_operating_system_failed (line 247) | def test_probe_no_operating_system_failed(self, mock):
    method test_probe_no_variant (line 257) | def test_probe_no_variant(self, mock):
    method test_probe_no_mxnet_variant_failed (line 268) | def test_probe_no_mxnet_variant_failed(self, mock):
    method test_probe_no_bucket (line 278) | def test_probe_no_bucket(self, mock):
    method test_probe_no_bucket_failed (line 289) | def test_probe_no_bucket_failed(self, mock):
    method test_get_s3_key_prefix (line 298) | def test_get_s3_key_prefix(self):
    method test_get_s3_key_prefix_with_subdir (line 309) | def test_get_s3_key_prefix_with_subdir(self):
    method test_try_s3_download_fails_on_bad_response (line 321) | def test_try_s3_download_fails_on_bad_response(self, mock_s3):
    method test_try_s3_download_returns_false_on_no_keys (line 334) | def test_try_s3_download_returns_false_on_no_keys(self, mock_s3):
    method test_try_s3_download_with_destination (line 346) | def test_try_s3_download_with_destination(self, mock_s3, mock_makedirs):
    method test_try_s3_download (line 385) | def test_try_s3_download(self, mock_s3, mock_makedirs):
    method test_s3_upload (line 423) | def test_s3_upload(self, mock_s3):
    method test_is_file_is_file (line 447) | def test_is_file_is_file(self, mock_exists, mock_isfile):
    method test_is_file_not_file (line 457) | def test_is_file_not_file(self, mock_exists, mock_isfile):
    method test_is_file_not_found (line 466) | def test_is_file_not_found(self, mock_exists):
    method test_sanitize_path_array_empty_paths (line 475) | def test_sanitize_path_array_empty_paths(self):
    method test_sanitize_path_array_directories (line 483) | def test_sanitize_path_array_directories(self, mock_glob, mock_isfile):
    method test_write_libmxnet_meta (line 491) | def test_write_libmxnet_meta(self):
    method test_push_artifact_throws_no_license_error (line 506) | def test_push_artifact_throws_no_license_error(self):

FILE: ci/build.py
  function get_platforms (line 40) | def get_platforms() -> List[str]:
  function get_docker_tag (line 46) | def get_docker_tag(platform: str, registry: str) -> str:
  function build_docker (line 52) | def build_docker(platform: str, registry: str, num_retries: int, no_cach...
  function buildir (line 83) | def buildir() -> str:
  function default_ccache_dir (line 87) | def default_ccache_dir() -> str:
  function container_run (line 106) | def container_run(platform: str,
  function list_platforms (line 181) | def list_platforms() -> str:
  function load_docker_cache (line 185) | def load_docker_cache(platform, tag, docker_registry) -> None:
  function log_environment (line 197) | def log_environment():
  function main (line 205) | def main() -> int:

FILE: ci/build_windows.py
  class BuildFlavour (line 49) | class BuildFlavour(Enum):
  function windows_build (line 141) | def windows_build(args):
  function windows_package (line 202) | def windows_package(args):
  function nix_build (line 229) | def nix_build(args):
  function main (line 245) | def main():

FILE: ci/dev_menu.py
  class Confirm (line 41) | class Confirm(object):
    method __init__ (line 42) | def __init__(self, cmds):
    method __call__ (line 45) | def __call__(self):
  class CMake (line 57) | class CMake(object):
    method __init__ (line 58) | def __init__(self, cmake_options_yaml=DEFAULT_CMAKE_OPTIONS, cmake_opt...
    method cmake_command (line 67) | def cmake_command(self) -> str:
    method __call__ (line 75) | def __call__(self, build_dir='build', generator='Ninja', build_cmd='ni...
  function create_virtualenv (line 89) | def create_virtualenv(venv_exe, pyexe, venv) -> None:
  function create_virtualenv_default (line 97) | def create_virtualenv_default():
  function provision_virtualenv (line 102) | def provision_virtualenv(venv_path=DEFAULT_PYENV):
  function clip (line 163) | def clip(x, mini, maxi):
  function show_menu (line 167) | def show_menu(items: List[str], header=None) -> int:
  function handle_commands (line 182) | def handle_commands(cmds) -> None:
  function use_menu_ui (line 197) | def use_menu_ui(args) -> None:
  function build (line 205) | def build(args) -> None:
  function main (line 221) | def main():

FILE: ci/docker_login.py
  function _get_dockerhub_credentials (line 34) | def _get_dockerhub_credentials(secret_name: str, secret_endpoint_url: st...
  function login_dockerhub (line 64) | def login_dockerhub(secret_name: str, secret_endpoint_url: str, secret_e...
  function logout_dockerhub (line 89) | def logout_dockerhub():
  function main (line 99) | def main(command_line_arguments):

FILE: ci/publish/scala/buildkey.py
  function getCredentials (line 37) | def getCredentials():
  function importASC (line 75) | def importASC(key, gpgPassphrase):
  function encryptMasterPSW (line 85) | def encryptMasterPSW(password):
  function encryptPSW (line 98) | def encryptPSW(password):
  function masterPSW (line 111) | def masterPSW(password):
  function serverPSW (line 117) | def serverPSW(username, password, gpgPassphrase):

FILE: ci/test_docker_login.py
  function mock_boto (line 41) | def mock_boto(num_calls: int = 1):
  class TestDockerLogin (line 59) | class TestDockerLogin(unittest.TestCase):
    method test_docker_login_success (line 62) | def test_docker_login_success(self, mock_run):
    method test_docker_login_retry (line 95) | def test_docker_login_retry(self, mock_sleep, mock_run):
    method test_docker_login_retry_exhausted (line 134) | def test_docker_login_retry_exhausted(self, mock_sleep, mock_run):
    method test_docker_login_failed (line 166) | def test_docker_login_failed(self, mock_run):
    method test_logout (line 185) | def test_logout(self, mock_call):
    method test_main_exit (line 195) | def test_main_exit(self, mock_login):
    method test_main_default_argument_values (line 204) | def test_main_default_argument_values(self, mock_login):

FILE: ci/util.py
  function get_mxnet_root (line 28) | def get_mxnet_root() -> str:
  function remember_cwd (line 43) | def remember_cwd():
  function retry (line 52) | def retry(target_exception, tries=4, delay_s=1, backoff=2):
  function under_ci (line 92) | def under_ci() -> bool:
  function ec2_instance_info (line 97) | def ec2_instance_info() -> str:
  function chdir_to_script_directory (line 120) | def chdir_to_script_directory():
  function script_name (line 127) | def script_name() -> str:
  function config_logging (line 132) | def config_logging():
  function download_file (line 143) | def download_file(url, dest_path):
  function run_command (line 160) | def run_command(args, shell=False):

FILE: conftest.py
  function pytest_configure (line 33) | def pytest_configure(config):
  function pytest_sessionfinish (line 41) | def pytest_sessionfinish(session, exitstatus):
  function pytest_runtest_makereport (line 47) | def pytest_runtest_makereport(item, call):
  function module_scope_waitall (line 62) | def module_scope_waitall(request):
  function module_scope_seed (line 76) | def module_scope_seed(request):
  function function_scope_seed (line 157) | def function_scope_seed(request):

FILE: contrib/tvmop/basic/ufunc.py
  function compute_add (line 23) | def compute_add(dtype, ndim):
  function vadd (line 34) | def vadd(dtype, ndim):
  function vadd_gpu (line 45) | def vadd_gpu(dtype, ndim):
  function compute_backward_vadd (line 56) | def compute_backward_vadd(dtype, ndim, reduce1st, req):
  function backward_vadd (line 77) | def backward_vadd(dtype, ndim, reduce1st, req):
  function backward_vadd_gpu (line 89) | def backward_vadd_gpu(dtype, ndim, reduce1st, req):
  function compute_degandrad (line 103) | def compute_degandrad(dtype, ndim, n):
  function deg2rad (line 118) | def deg2rad(dtype, ndim):
  function rad2deg (line 128) | def rad2deg(dtype, ndim):
  function deg2rad_gpu (line 138) | def deg2rad_gpu(dtype, ndim):
  function rad2deg_gpu (line 151) | def rad2deg_gpu(dtype, ndim):
  function compute_backward_degandrad (line 162) | def compute_backward_degandrad(dtype, ndim, req, n):
  function backward_deg2rad (line 183) | def backward_deg2rad(dtype, ndim, req):
  function backward_rad2deg (line 195) | def backward_rad2deg(dtype, ndim, req):
  function cuda_backward_deg2rad (line 207) | def cuda_backward_deg2rad(dtype, ndim, req):
  function cuda_backward_rad2deg (line 224) | def cuda_backward_rad2deg(dtype, ndim, req):

FILE: contrib/tvmop/compile.py
  function create_shared (line 37) | def create_shared(output,
  function _linux_compile (line 60) | def _linux_compile(output, objects, options, compile_cmd="g++"):
  function get_target (line 84) | def get_target(device):
  function get_cuda_arch (line 92) | def get_cuda_arch(arch):

FILE: contrib/tvmop/core/fromnumeric.py
  function _compute_sum (line 24) | def _compute_sum(itype, otype, ndim, reduce1st_dim, req):
  function _sum_cpu (line 37) | def _sum_cpu(itype, otype, ndim, reduce1st_dim, req):
  function _sum_gpu (line 51) | def _sum_gpu(itype, otype, ndim, reduce1st_dim, req):

FILE: contrib/tvmop/core/multiarray.py
  function compute_dot (line 24) | def compute_dot(A, B):
  function dot (line 36) | def dot(dtype, fallback):

FILE: contrib/tvmop/core/umath.py
  function _compute_binary_logic (line 34) | def _compute_binary_logic(op, dtype, ndim):
  function _binary_logic_cpu (line 60) | def _binary_logic_cpu(compute_func, op, itype, ndim):
  function _binary_logic_gpu (line 68) | def _binary_logic_gpu(compute_func, op, itype, ndim):
  function _compute_binary_scalar_logic (line 99) | def _compute_binary_scalar_logic(op, dtype, ndim):

FILE: contrib/tvmop/opdef.py
  class OpDef (line 26) | class OpDef:
    method __init__ (line 52) | def __init__(self, func, name, target, auto_broadcast, **kwargs):
    method __call__ (line 75) | def __call__(self, *args, **kwargs):
    method invoke_all (line 78) | def invoke_all(self):
    method get_op_name (line 104) | def get_op_name(self, name, args):
    method get_config_spaces (line 107) | def get_config_spaces(self):
    method get_binds (line 117) | def get_binds(self, args):
  function defop (line 124) | def defop(name, target=None, auto_broadcast=False, **kwargs):

FILE: contrib/tvmop/space.py
  class OtherOptionSpace (line 23) | class OtherOptionSpace(object):
    method __init__ (line 25) | def __init__(self, entities):
    method from_tvm (line 29) | def from_tvm(cls, x):
    method __len__ (line 32) | def __len__(self):
    method __repr__ (line 35) | def __repr__(self):
  class OtherOptionEntity (line 39) | class OtherOptionEntity(object):
    method __init__ (line 41) | def __init__(self, val):
    method from_tvm (line 45) | def from_tvm(cls, x):
    method __repr__ (line 62) | def __repr__(self):
  class ConfigSpace (line 66) | class ConfigSpace(object):
    method __init__ (line 68) | def __init__(self, space_map, _entity_map):
    method from_tvm (line 74) | def from_tvm(cls, x):
    method __len__ (line 93) | def __len__(self):
    method __repr__ (line 99) | def __repr__(self):
    method to_json_dict (line 105) | def to_json_dict(self):
    method from_json_dict (line 129) | def from_json_dict(cls, json_dict):
  class ConfigSpaces (line 163) | class ConfigSpaces(object):
    method __init__ (line 165) | def __init__(self):
    method __setitem__ (line 168) | def __setitem__(self, name, space):
    method __len__ (line 171) | def __len__(self):
    method __repr__ (line 174) | def __repr__(self):
    method to_json_dict (line 180) | def to_json_dict(self):
    method from_json_dict (line 194) | def from_json_dict(cls, json_dict):

FILE: contrib/tvmop/utils.py
  function assign_by_req (line 26) | def assign_by_req(a, req, otype=None):
  function reduce_axes (line 36) | def reduce_axes(X, axes, reducer, atype=None):

FILE: cpp-package/example/alexnet.cpp
  function Symbol (line 32) | Symbol AlexnetSymbol(int num_classes) {
  function NDArray (line 199) | NDArray ResizeInput(NDArray data, const Shape new_shape) {
  function main (line 213) | int main(int argc, char const *argv[]) {

FILE: cpp-package/example/charRNN.cpp
  type LSTMState (line 51) | struct LSTMState {
  type LSTMParam (line 56) | struct LSTMParam {
  function LSTMState (line 66) | LSTMState LSTM(int num_hidden, const Symbol& indata, const LSTMState& pr...
  function Symbol (line 87) | Symbol LSTMUnroll(int num_lstm_layer, int sequence_length, int input_dim,
  function Symbol (line 150) | Symbol LSTMWithBuiltInRNNOp(int num_lstm_layer, int sequence_length, int...
  class Shuffler (line 194) | class Shuffler {
    method Shuffler (line 197) | explicit Shuffler(int size) : sequence(size) {
    method shuffle (line 202) | void shuffle(std::function<void(int, int)> lambda = nullptr) {
  class BucketSentenceIter (line 216) | class BucketSentenceIter : public DataIter {
    method BucketSentenceIter (line 226) | BucketSentenceIter(std::string filename, int minibatch, Context contex...
    method maxSequenceLength (line 248) | unsigned int maxSequenceLength() {
    method characterSize (line 252) | size_t characterSize() {
    method Next (line 256) | virtual bool Next(void) {
    method NDArray (line 259) | virtual NDArray GetData(void) {
    method NDArray (line 274) | virtual NDArray GetLabel(void) {
    method GetPadNum (line 289) | virtual int GetPadNum(void) {
    method GetIndex (line 292) | virtual std::vector<int> GetIndex(void) {
    method BeforeFirst (line 297) | virtual void BeforeFirst(void) {
    method readContent (line 302) | std::wstring readContent(const std::string file) {
    method buildCharIndex (line 312) | void buildCharIndex(const std::wstring& content) {
    method wchar_t (line 340) | inline wchar_t character(int i) {
    method mx_float (line 344) | inline mx_float index(wchar_t c) {
    method saveCharIndices (line 348) | void saveCharIndices(const std::string file) {
    method loadCharIndices (line 356) | static std::tuple<std::unordered_map<wchar_t, mx_float>, std::vector<w...
    method convertTextToSequences (line 376) | std::vector<std::vector<mx_float>>
  function OutputPerplexity (line 389) | void OutputPerplexity(NDArray* labels, NDArray* output) {
  function SaveCheckpoint (line 407) | void SaveCheckpoint(const std::string filepath, Symbol net, Executor* ex...
  function LoadCheckpoint (line 419) | void LoadCheckpoint(const std::string filepath, Executor* exe) {
  function train (line 441) | void train(const std::string file, int batch_size, int max_epoch, int st...
  class RNNXavier (line 521) | class RNNXavier : public Xavier {
    method RNNXavier (line 523) | RNNXavier(RandType rand_type = gaussian, FactorType factor_type = avg,
    method InitDefault (line 528) | virtual void InitDefault(NDArray* arr) {
  function trainWithBuiltInRNNOp (line 533) | void trainWithBuiltInRNNOp(const std::string file, int batch_size, int m...
  function predict (line 596) | void predict(std::wstring* ptext, int sequence_length, const std::string...
  function predictWithBuiltInRNNOp (line 660) | void predictWithBuiltInRNNOp(std::wstring* ptext, int sequence_length, c...
  function main (line 716) | int main(int argc, char** argv) {

FILE: cpp-package/example/feature_extract/feature_extract.cpp
  class FeatureExtractor (line 41) | class FeatureExtractor {
    method GetFeatureSymbol (line 51) | void GetFeatureSymbol() {
    method LoadParameters (line 65) | void LoadParameters() {
    method GetMeanImg (line 81) | void GetMeanImg() {
    method FeatureExtractor (line 90) | FeatureExtractor() {
    method Extract (line 97) | void Extract(NDArray data) {
  function NDArray (line 117) | NDArray Data2NDArray() {
  function main (line 128) | int main() {

FILE: cpp-package/example/feature_extract/prepare_data_with_opencv.cpp
  function Mat2Array (line 31) | void Mat2Array() {
  function main (line 52) | int main(int argc, char *argv[]) {

FILE: cpp-package/example/googlenet.cpp
  function Symbol (line 31) | Symbol ConvFactory(Symbol data, int num_filter,
  function Symbol (line 45) | Symbol InceptionFactory(Symbol data, int num_1x1, int num_3x3red,
  function Symbol (line 78) | Symbol GoogleNetSymbol(int num_classes) {
  function main (line 116) | int main(int argc, char const *argv[]) {

FILE: cpp-package/example/inception_bn.cpp
  function Symbol (line 31) | Symbol ConvFactoryBN(Symbol data, int num_filter,
  function Symbol (line 49) | Symbol InceptionFactoryA(Symbol data, int num_1x1, int num_3x3red,
  function Symbol (line 79) | Symbol InceptionFactoryB(Symbol data, int num_3x3red, int num_3x3,
  function Symbol (line 103) | Symbol InceptionSymbol(int num_classes) {
  function NDArray (line 145) | NDArray ResizeInput(NDArray data, const Shape new_shape) {
  function main (line 159) | int main(int argc, char const *argv[]) {

FILE: cpp-package/example/inference/imagenet_inference.cpp
  function ms_now (line 47) | double ms_now() {
  type TypeFlag (line 62) | enum TypeFlag {
  class Predictor (line 78) | class Predictor {
    method Predictor (line 80) | Predictor() {}
    method FileExists (line 111) | inline bool FileExists(const std::string &name) {
  function createVectorFromString (line 525) | std::vector<T> createVectorFromString(const std::string& input_string) {
  function printUsage (line 548) | void printUsage() {
  function main (line 570) | int main(int argc, char** argv) {

FILE: cpp-package/example/inference/multi_threaded_inference/get_model.py
  function download (line 27) | def download(url, fname=None, dirname=None, overwrite=False, retries=5):
  function download_model (line 97) | def download_model(model_name, dst_dir='./', meta_info=None):
  function main (line 163) | def main():

FILE: cpp-package/example/inference/multi_threaded_inference/multi_threaded_inference.cc
  function trim (line 43) | static std::string trim(const std::string& input) {
  function LoadSynset (line 51) | std::vector<std::string> LoadSynset(const std::string& synset_file) {
  function PrintOutputResult (line 72) | void PrintOutputResult(const float* data, size_t size, const std::vector...
  function GetImageFile (line 92) | void GetImageFile(const std::string& image_file,
  function prepare_input_data (line 128) | void prepare_input_data(const mxnet::cpp::Shape& shape,
  function run_inference (line 145) | void run_inference(const std::string& model_name,
  function main (line 305) | int main(int argc, char* argv[]) {

FILE: cpp-package/example/inference/sentiment_analysis_rnn.cpp
  class Predictor (line 55) | class Predictor {
    method Predictor (line 57) | Predictor() {}
    method FileExists (line 70) | inline bool FileExists(const std::string& name) {
  function printUsage (line 396) | void printUsage() {
  function DownloadFiles (line 413) | void DownloadFiles(const std::vector<std::string> model_files) {
  function main (line 426) | int main(int argc, char** argv) {

FILE: cpp-package/example/lenet.cpp
  function ctx_dev (line 39) | ctx_dev(Context(DeviceType::kGPU, 0))
  function Run (line 43) | void Run(int max_epoch) {
  function GetData (line 189) | size_t GetData(std::vector<float> *data, std::vector<float> *label) {
  function ValAccuracy (line 209) | float ValAccuracy(int batch_size, Symbol lenet) {
  function main (line 260) | int main(int argc, char const *argv[]) {

FILE: cpp-package/example/lenet_with_mxdataiter.cpp
  function Symbol (line 33) | Symbol LenetSymbol() {
  function NDArray (line 69) | NDArray ResizeInput(NDArray data, const Shape new_shape) {
  function main (line 79) | int main(int argc, char const *argv[]) {

FILE: cpp-package/example/mlp.cpp
  function OutputAccuracy (line 37) | void OutputAccuracy(mx_float* pred, mx_float* target) {
  function MLP (line 53) | void MLP(int max_epoch) {
  function main (line 175) | int main(int argc, char** argv) {

FILE: cpp-package/example/mlp_cpu.cpp
  function Symbol (line 29) | Symbol mlp(const std::vector<int> &layers) {
  function main (line 51) | int main(int argc, char** argv) {

FILE: cpp-package/example/mlp_csv.cpp
  function Symbol (line 37) | Symbol mlp(const std::vector<int> &hidden_units) {
  function getLayers (line 61) | std::vector<int> getLayers(const std::string& hidden_units_string) {
  function printUsage (line 73) | void printUsage() {
  function main (line 83) | int main(int argc, char** argv) {

FILE: cpp-package/example/mlp_gpu.cpp
  function Symbol (line 29) | Symbol mlp(const std::vector<int> &layers) {
  function main (line 51) | int main(int argc, char** argv) {

FILE: cpp-package/example/mnist_to_csv.py
  function convert_to_csv (line 26) | def convert_to_csv(args):

FILE: cpp-package/example/resnet.cpp
  function Symbol (line 32) | Symbol ConvolutionNoBias(const std::string& symbol_name,
  function Symbol (line 56) | Symbol getConv(const std::string & name, Symbol data,
  function Symbol (line 81) | Symbol makeBlock(const std::string & name, Symbol data, int num_filter,
  function Symbol (line 114) | Symbol getBody(Symbol data, int num_level, int num_block, int num_filter...
  function Symbol (line 125) | Symbol ResNetSymbol(int num_class, int num_level = 3, int num_block = 9,
  function NDArray (line 156) | NDArray ResizeInput(NDArray data, const Shape new_shape) {
  function main (line 170) | int main(int argc, char const *argv[]) {

FILE: cpp-package/example/test_kvstore.cpp
  function test_single_key (line 24) | static bool test_single_key(const Context &context, const std::string &c...
  function test_multiple_key (line 83) | static bool test_multiple_key(const Context &context, const std::string ...
  function main (line 175) | int main(int argc, char** argv) {

FILE: cpp-package/example/test_ndarray_copy.cpp
  type TypeFlag (line 26) | enum TypeFlag {
  function main (line 41) | int main(int argc, char** argv) {

FILE: cpp-package/example/test_optimizer.cpp
  function main (line 27) | int main(int argc, char** argv) {

FILE: cpp-package/example/test_regress_label.cpp
  function main (line 31) | int main() {

FILE: cpp-package/example/test_score.cpp
  function Symbol (line 34) | Symbol mlp(const std::vector<int> &layers) {
  function main (line 56) | int main(int argc, char** argv) {

FILE: cpp-package/example/utils.h
  function isFileExists (line 39) | bool isFileExists(const std::string& filename) {
  function check_datafiles (line 44) | bool check_datafiles(const std::vector<std::string>& data_files) {
  function setDataIter (line 54) | bool setDataIter(MXDataIter* iter,

FILE: cpp-package/include/mxnet-cpp/base.h
  function namespace (line 33) | namespace mxnet {

FILE: cpp-package/include/mxnet-cpp/contrib.h
  function namespace (line 34) | namespace mxnet {

FILE: cpp-package/include/mxnet-cpp/executor.h
  function namespace (line 37) | namespace mxnet {

FILE: cpp-package/include/mxnet-cpp/executor.hpp
  type mxnet (line 36) | namespace mxnet {
    type cpp (line 37) | namespace cpp {

FILE: cpp-package/include/mxnet-cpp/initializer.h
  function class (line 38) | class Initializer {
  function class (line 119) | class Constant : public Initializer {
  function class (line 130) | class Zero : public Constant {
  function class (line 135) | class One : public Constant {
  function class (line 140) | class Uniform : public Initializer {

FILE: cpp-package/include/mxnet-cpp/io.h
  function namespace (line 36) | namespace mxnet {

FILE: cpp-package/include/mxnet-cpp/io.hpp
  type mxnet (line 32) | namespace mxnet {
    type cpp (line 33) | namespace cpp {
      function MXDataIterMap (line 35) | inline MXDataIterMap*& MXDataIter::mxdataiter_map() {
      function NDArray (line 57) | inline NDArray MXDataIter::GetData() {
      function NDArray (line 64) | inline NDArray MXDataIter::GetLabel() {
      function MXDataIter (line 88) | inline MXDataIter MXDataIter::CreateDataIter() {

FILE: cpp-package/include/mxnet-cpp/kvstore.h
  function namespace (line 33) | namespace mxnet {

FILE: cpp-package/include/mxnet-cpp/kvstore.hpp
  type mxnet (line 38) | namespace mxnet {
    type cpp (line 39) | namespace cpp {
      function KVStoreHandle (line 59) | inline KVStoreHandle& KVStore::get_handle() {
      function KVStore (line 69) | inline KVStore*& KVStore::get_kvstore() {

FILE: cpp-package/include/mxnet-cpp/lr_scheduler.h
  function namespace (line 30) | namespace mxnet {

FILE: cpp-package/include/mxnet-cpp/metric.h
  function namespace (line 37) | namespace cpp {

FILE: cpp-package/include/mxnet-cpp/model.h
  function namespace (line 35) | namespace mxnet {

FILE: cpp-package/include/mxnet-cpp/ndarray.h
  function namespace (line 37) | namespace mxnet {
  type NDBlob (line 92) | struct NDBlob {

FILE: cpp-package/include/mxnet-cpp/ndarray.hpp
  type mxnet (line 38) | namespace mxnet {
    type cpp (line 39) | namespace cpp {
      function NDArray (line 96) | inline NDArray NDArray::operator+(mx_float scalar) {
      function NDArray (line 101) | inline NDArray NDArray::operator-(mx_float scalar) {
      function NDArray (line 106) | inline NDArray NDArray::operator*(mx_float scalar) {
      function NDArray (line 111) | inline NDArray NDArray::operator/(mx_float scalar) {
      function NDArray (line 116) | inline NDArray NDArray::operator%(mx_float scalar) {
      function NDArray (line 121) | inline NDArray NDArray::operator+(const NDArray &rhs) {
      function NDArray (line 126) | inline NDArray NDArray::operator-(const NDArray &rhs) {
      function NDArray (line 131) | inline NDArray NDArray::operator*(const NDArray &rhs) {
      function NDArray (line 136) | inline NDArray NDArray::operator/(const NDArray &rhs) {
      function NDArray (line 141) | inline NDArray NDArray::operator%(const NDArray &rhs) {
      function NDArray (line 146) | inline NDArray &NDArray::operator=(mx_float scalar) {
      function NDArray (line 150) | inline NDArray &NDArray::operator+=(mx_float scalar) {
      function NDArray (line 154) | inline NDArray &NDArray::operator-=(mx_float scalar) {
      function NDArray (line 158) | inline NDArray &NDArray::operator*=(mx_float scalar) {
      function NDArray (line 162) | inline NDArray &NDArray::operator/=(mx_float scalar) {
      function NDArray (line 166) | inline NDArray &NDArray::operator%=(mx_float scalar) {
      function NDArray (line 170) | inline NDArray &NDArray::operator+=(const NDArray &rhs) {
      function NDArray (line 174) | inline NDArray &NDArray::operator-=(const NDArray &rhs) {
      function NDArray (line 178) | inline NDArray &NDArray::operator*=(const NDArray &rhs) {
      function NDArray (line 182) | inline NDArray &NDArray::operator/=(const NDArray &rhs) {
      function NDArray (line 186) | inline NDArray &NDArray::operator%=(const NDArray &rhs) {
      function NDArray (line 191) | inline NDArray NDArray::ArgmaxChannel() {
      function NDArray (line 211) | inline NDArray NDArray::Copy(const Context &ctx) const {
      function NDArray (line 216) | inline NDArray NDArray::CopyTo(NDArray * other) const {
      function NDArray (line 220) | inline NDArray NDArray::Slice(mx_uint begin, mx_uint end) const {
      function NDArray (line 225) | inline NDArray NDArray::Reshape(const Shape &new_shape) const {
      function mx_float (line 390) | inline mx_float NDArray::At(size_t h, size_t w) const {
      function mx_float (line 394) | inline mx_float NDArray::At(size_t c, size_t h, size_t w) const {
      function mx_float (line 398) | inline mx_float NDArray::At(size_t index) const {
      function mx_float (line 428) | inline const mx_float *NDArray::GetData() const {
      function Context (line 437) | inline Context NDArray::GetContext() const {

FILE: cpp-package/include/mxnet-cpp/op_map.h
  function namespace (line 34) | namespace mxnet {

FILE: cpp-package/include/mxnet-cpp/op_suppl.h
  function namespace (line 37) | namespace mxnet {

FILE: cpp-package/include/mxnet-cpp/op_util.h
  function namespace (line 36) | namespace mxnet {

FILE: cpp-package/include/mxnet-cpp/operator.h
  function namespace (line 36) | namespace mxnet {

FILE: cpp-package/include/mxnet-cpp/operator.hpp
  type mxnet (line 37) | namespace mxnet {
    type cpp (line 38) | namespace cpp {
      function Operator (line 45) | inline Operator& Operator::SetParam<NDArray>(int pos, const NDArray ...
      function Operator (line 50) | inline Operator& Operator::SetParam<Symbol>(int pos, const Symbol &v...
      function OpMap (line 55) | inline OpMap*& Operator::op_map() {
      function Symbol (line 82) | inline Symbol Operator::CreateSymbol(const std::string &name) {
      function Operator (line 163) | inline Operator &Operator::SetInput(const std::string &name, const S...
      function Operator (line 171) | inline Operator &Operator::SetInput(const std::string &name, const N...

FILE: cpp-package/include/mxnet-cpp/optimizer.h
  function namespace (line 41) | namespace mxnet {

FILE: cpp-package/include/mxnet-cpp/optimizer.hpp
  function _clip (line 44) | inline void _clip(mxnet::cpp::NDArray &data, float limit) {
  function _sqrt (line 51) | inline mxnet::cpp::NDArray _sqrt(mxnet::cpp::NDArray data) {
  type mxnet (line 59) | namespace mxnet {
    type cpp (line 60) | namespace cpp {
      function OpMap (line 73) | inline OpMap*& Optimizer::op_map() {
      function Optimizer (line 128) | inline Optimizer* OptimizerRegistry::Find(const std::string& name) {

FILE: cpp-package/include/mxnet-cpp/shape.h
  function namespace (line 35) | namespace mxnet {
  function ndim_ (line 170) | Shape(const Shape& s) : ndim_(s.ndim_) {
  function CopyFrom (line 207) | void CopyFrom(RandomAccessIterator begin, RandomAccessIterator end) {
  function index_t (line 232) | inline const index_t* data() const {
  function index_t (line 236) | inline index_t* data() {
  function index_t (line 240) | inline index_t ndim(void) const {
  function index_t (line 248) | inline index_t& operator[](index_t i) {
  function index_t (line 256) | inline const index_t& operator[](index_t i) const {
  function Size (line 260) | inline size_t Size(void) const {
  function operator (line 272) | inline bool operator==(const Shape& s) const {
  function operator (line 292) | inline bool operator!=(const Shape& s) const {
  function SetDim (line 317) | inline void SetDim(index_t dim) {

FILE: cpp-package/include/mxnet-cpp/symbol.h
  function namespace (line 36) | namespace mxnet {

FILE: cpp-package/include/mxnet-cpp/symbol.hpp
  type mxnet (line 39) | namespace mxnet {
    type cpp (line 40) | namespace cpp {
      function OpMap (line 41) | inline OpMap*& Symbol::op_map() {
      function Symbol (line 54) | inline Symbol Symbol::Variable(const std::string &name) { return Sym...
      function Symbol (line 55) | inline Symbol Symbol::operator+(const Symbol &rhs) const { return _P...
      function Symbol (line 56) | inline Symbol Symbol::operator-(const Symbol &rhs) const { return _M...
      function Symbol (line 57) | inline Symbol Symbol::operator*(const Symbol &rhs) const { return _M...
      function Symbol (line 58) | inline Symbol Symbol::operator/(const Symbol &rhs) const { return _D...
      function Symbol (line 59) | inline Symbol Symbol::operator%(const Symbol &rhs) const { return _M...
      function Symbol (line 60) | inline Symbol Symbol::operator+(mx_float scalar) const {
      function Symbol (line 63) | inline Symbol Symbol::operator-(mx_float scalar) const {
      function Symbol (line 66) | inline Symbol Symbol::operator*(mx_float scalar) const {
      function Symbol (line 69) | inline Symbol Symbol::operator/(mx_float scalar) const {
      function Symbol (line 72) | inline Symbol Symbol::operator%(mx_float scalar) const {
      function Symbol (line 75) | inline Symbol Symbol::operator[](int index) {
      function Symbol (line 80) | inline Symbol Symbol::operator[](const std::string &index) {
      function Symbol (line 90) | inline Symbol Symbol::Group(const std::vector<Symbol> &symbols) {
      function Symbol (line 99) | inline Symbol Symbol::Load(const std::string &file_name) {
      function Symbol (line 105) | inline Symbol Symbol::LoadJSON(const std::string &json_str) {
      function Symbol (line 119) | inline Symbol Symbol::GetInternals() const {
      function Symbol (line 138) | inline Symbol Symbol::Copy() const {
      function mx_uint (line 210) | inline mx_uint Symbol::GetNumOutputs() const {
      function Executor (line 382) | inline Executor *Symbol::SimpleBind(
      function Executor (line 400) | inline Executor *Symbol::Bind(const Context &context,
      function Symbol (line 410) | inline Symbol operator+(mx_float lhs, const Symbol &rhs) { return rh...
      function Symbol (line 411) | inline Symbol operator-(mx_float lhs, const Symbol &rhs) {
      function Symbol (line 414) | inline Symbol operator*(mx_float lhs, const Symbol &rhs) { return rh...
      function Symbol (line 415) | inline Symbol operator/(mx_float lhs, const Symbol &rhs) {
      function Symbol (line 418) | inline Symbol operator%(mx_float lhs, const Symbol &rhs) {

FILE: cpp-package/scripts/OpWrapperGenerator.py
  function gen_enum_value (line 35) | def gen_enum_value(value):
  class EnumType (line 38) | class EnumType:
    method __init__ (line 41) | def __init__(self, typeName = 'ElementWiseOpType', \
    method GetDefinitionString (line 52) | def GetDefinitionString(self, indent = 0):
    method GetDefaultValueString (line 62) | def GetDefaultValueString(self, value = ''):
    method GetEnumStringArray (line 64) | def GetEnumStringArray(self, indent = 0):
    method GetConvertEnumVariableToString (line 74) | def GetConvertEnumVariableToString(self, variable=''):
  class Arg (line 78) | class Arg:
    method __init__ (line 112) | def __init__(self, opName = '', argName = '', typeString = '', descStr...
    method MakeCString (line 151) | def MakeCString(self, str):
    method ConstructEnumTypeName (line 156) | def ConstructEnumTypeName(self, opName = '', argName = ''):
  class Op (line 166) | class Op:
    method __init__ (line 171) | def __init__(self, name = '', description = '', args = []):
    method WrapDescription (line 190) | def WrapDescription(self, desc = ''):
    method GenDescription (line 209) | def GenDescription(self, desc = '', \
    method GetOpDefinitionString (line 222) | def GetOpDefinitionString(self, use_name, indent=0):
    method GetArgString (line 302) | def GetArgString(self, arg):
  function ParseAllOps (line 309) | def ParseAllOps():

FILE: cpp-package/scripts/lint.py
  class LintHelper (line 36) | class LintHelper(object):
    method _print_summary_map (line 40) | def _print_summary_map(strm, result_map, ftype):
    method __init__ (line 52) | def __init__(self):
    method process_cpp (line 77) | def process_cpp(self, path, suffix):
    method process_python (line 89) | def process_python(self, path):
    method print_summary (line 107) | def print_summary(self, strm):
  function get_header_guard_dmlc (line 122) | def get_header_guard_dmlc(filename):
  function process (line 147) | def process(fname, allow_type):
  function main (line 161) | def main():

FILE: docs/python_docs/_static/autodoc.js
  function auto_index (line 21) | function auto_index() {

FILE: docs/python_docs/python/scripts/conf.py
  function setup (line 254) | def setup(app):

FILE: docs/python_docs/python/scripts/md2ipynb.py
  function md2ipynb (line 24) | def md2ipynb():

FILE: docs/python_docs/python/scripts/process_rst.py
  function has_token (line 22) | def has_token(token, lines):
  function get_next_title_mark (line 28) | def get_next_title_mark(lines):
  function add_hidden_title (line 37) | def add_hidden_title(inputs):

FILE: docs/python_docs/python/tutorials/getting-started/crash-course/prepare_dataset.py
  function split_file_list (line 30) | def split_file_list(file_list, train_split=0.7, val_split=0.2, test_spli...
  function process_dataset (line 41) | def process_dataset(root_directory, splits=splits, classes=targets, trai...

FILE: docs/python_docs/themes/mx-theme/mxtheme/__init__.py
  function get_path (line 9) | def get_path():
  function setup (line 12) | def setup(app):

FILE: docs/python_docs/themes/mx-theme/mxtheme/card.py
  class card (line 5) | class card(nodes.General, nodes.Element):
  class CardDirective (line 8) | class CardDirective(Directive):
    method run (line 21) | def run(self):

FILE: docs/python_docs/themes/mx-theme/mxtheme/static/sphinx_materialdesign_theme.js
  function f (line 1) | function f(t,n){if(!r[t]){if(!e[t]){var i="function"==typeof parcelRequi...
  function e (line 6) | function e(t){return(e="function"==typeof Symbol&&"symbol"==typeof Symbo...
  function t (line 6) | function t(e,t){if(e){if(t.element_.classList.contains(t.CssClasses_.MDL...
  function s (line 6) | function s(e,t,s,i){function n(){var n=e.href.split("#")[1],a=i.content_...
  function t (line 6) | function t(e,t){for(var s=0;s<r.length;s++)if(r[s].className===e)return ...
  function s (line 6) | function s(e){var t=e.getAttribute("data-upgraded");return null===t?[""]...
  function i (line 6) | function i(e,t){return-1!==s(e).indexOf(t)}
  function n (line 6) | function n(e,t,s){if("CustomEvent"in window&&"function"==typeof window.C...
  function a (line 6) | function a(e,s){if(void 0===e&&void 0===s)for(var i=0;i<r.length;i++)a(r...
  function l (line 6) | function l(a,l){if(!("object"==e(a)&&a instanceof Element))throw new Err...
  function o (line 6) | function o(e){if(e){var t=_.indexOf(e);_.splice(t,1);var s=e.element_.ge...
  function i (line 219) | function i(t){return isFinite(t=+t)&&0!=t?t<0?-i(-t):Math.log(t+Math.sqr...
  function r (line 345) | function r(){}
  function p (line 427) | function p(r,e,n,i,a,u){var c=n+r.length,l=i.length,g=s;return void 0!==...
  function e (line 447) | function e(e){var o,t;this.promise=new e(function(r,e){if(void 0!==o||vo...
  function O (line 486) | function O(t,n,r){var e,i,o,u=new Array(r),f=8*r-n-1,s=(1<<f)-1,c=s>>1,a...
  function R (line 486) | function R(t,n,r){var e,i=8*r-n-1,o=(1<<i)-1,u=o>>1,f=i-7,s=r-1,c=t[s--]...
  function k (line 486) | function k(t){return t[3]<<24|t[2]<<16|t[1]<<8|t[0]}
  function z (line 486) | function z(t){return[255&t]}
  function C (line 486) | function C(t){return[255&t,t>>8&255]}
  function G (line 486) | function G(t){return[255&t,t>>8&255,t>>16&255,t>>24&255]}
  function H (line 486) | function H(t){return O(t,52,8)}
  function J (line 486) | function J(t){return O(t,23,4)}
  function K (line 486) | function K(t,n,r){l(t[w],n,{get:function(){return this[r]}})}
  function P (line 486) | function P(t,n,r,e){var i=a(+r);if(i+n>t[D])throw U(b);var o=t[m]._b,u=i...
  function Q (line 486) | function Q(t,n,r,e,i,o){var u=a(+r);if(u+n>t[D])throw U(b);for(var f=t[m...
  function e (line 515) | function e(){}
  function a (line 523) | function a(i,c){var v,g,l=arguments.length<3?i:arguments[2];return u(i)=...
  function c (line 539) | function c(u,l,n){var q,s,_=arguments.length<4?u:arguments[3],b=r.f(o(u)...
  function u (line 545) | function u(s,a,n,c,f,l,q,_){for(var d,h,p=f,v=0,b=!!q&&t(q,_,3);v<c;){if...
  function o (line 689) | function o(n,o){return c.type="throw",c.arg=t,e.next=n,o&&(e.method="nex...
  function w (line 689) | function w(t,r,e,n){var o=r&&r.prototype instanceof x?r:x,i=Object.creat...
  function L (line 689) | function L(t,r,e){try{return{type:"normal",arg:t.call(r,e)}}catch(n){ret...
  function x (line 689) | function x(){}
  function b (line 689) | function b(){}
  function E (line 689) | function E(){}
  function j (line 689) | function j(t){["next","throw","return"].forEach(function(r){t[r]=functio...
  function _ (line 689) | function _(r){function e(t,o,i,a){var c=L(r[t],r,o);if("throw"!==c.type)...
  function O (line 689) | function O(t,e){var n=t.iterator[e.method];if(n===r){if(e.delegate=null,...
  function k (line 689) | function k(t){var r={tryLoc:t[0]};1 in t&&(r.catchLoc=t[1]),2 in t&&(r.f...
  function G (line 689) | function G(t){var r=t.completion||{};r.type="normal",delete r.arg,t.comp...
  function N (line 689) | function N(t){this.tryEntries=[{tryLoc:"root"}],t.forEach(k,this),this.r...
  function P (line 689) | function P(t){if(t){var e=t[i];if(e)return e.call(t);if("function"==type...
  function S (line 689) | function S(){return{value:r,done:!0}}
  function i (line 699) | function i(e,i,n){e[i]||Object[r](e,i,{writable:!0,configurable:!0,value...
  function t (line 701) | function t(t,e){if(!(t instanceof e))throw new TypeError("Cannot call a ...
  function e (line 701) | function e(t,e){for(var n=0;n<e.length;n++){var s=e[n];s.enumerable=s.en...
  function n (line 701) | function n(t,n,s){return n&&e(t.prototype,n),s&&e(t,s),t}
  function e (line 701) | function e(n){t(this,e),this.doc=document,this.nav=this.doc.querySelecto...
  function t (line 703) | function t(a){return a&&a.__esModule?a:{default:a}}

FILE: docs/python_docs/themes/mx-theme/src/js/adjust-height.js
  class AdjustHeight (line 1) | class AdjustHeight {
    method constructor (line 3) | constructor() {
    method adjust (line 16) | adjust() {
    method setPageContentMinHeight (line 21) | setPageContentMinHeight() {
    method setLocaltocHeight (line 29) | setLocaltocHeight() {
    method attachEvent (line 40) | attachEvent() {

FILE: docs/python_docs/themes/mx-theme/src/js/scrollspy.js
  class ScrollSpy (line 1) | class ScrollSpy {
    method constructor (line 2) | constructor(args) {
    method attachEvent (line 22) | attachEvent() {
    method getContents (line 62) | getContents(contentSelector) {
    method spy (line 71) | spy() {
    method getViewState (line 76) | getViewState() {
    method isView (line 88) | isView(element) {
    method toggleNavClass (line 100) | toggleNavClass(elements) {
    method getTagDepth (line 129) | getTagDepth(element) {

FILE: docs/python_docs/themes/mx-theme/src/js/sphinx_materialdesign_theme.js
  function reconstructionDrawerGlobalToc (line 9) | function reconstructionDrawerGlobalToc() {
  function collapse (line 43) | function collapse() {
  function styleMdlCodeBlock (line 52) | function styleMdlCodeBlock() {
  function copyClipboard (line 64) | function copyClipboard(selector) {
  function quickSearchClickEvent (line 81) | function quickSearchClickEvent() {

FILE: docs/static_site/src/_plugins/markdowner.rb
  type Jekyll (line 18) | module Jekyll
    class MarkdownBlock (line 19) | class MarkdownBlock < Liquid::Block
      method initialize (line 20) | def initialize(tag_name, text, tokens)
      method render (line 24) | def render(context)

FILE: docs/static_site/src/assets/js/clipboard.js
  function n (line 25) | function n(){}
  function r (line 25) | function r(){o.off(t,r),e.apply(n,arguments)}
  function i (line 25) | function i(t,e,n,o,r){var i=function(e,n,t,o){return function(t){t.deleg...
  function a (line 25) | function a(t,e){for(var n=0;n<e.length;n++){var o=e[n];o.enumerable=o.en...
  function c (line 25) | function c(t){!function(t,e){if(!(t instanceof e))throw new TypeError("C...
  function y (line 25) | function y(t,e){for(var n=0;n<e.length;n++){var o=e[n];o.enumerable=o.en...
  function v (line 25) | function v(t,e){!function(t,e){if(!(t instanceof e))throw new TypeError(...
  function b (line 25) | function b(t,e){var n="data-clipboard-"+t;if(e.hasAttribute(n))return e....
  function r (line 25) | function r(t){if(o[t])return o[t].exports;var e=o[t]={i:t,l:!1,exports:{...

FILE: docs/static_site/src/assets/js/options.js
  function label (line 29) | function label(lbl) {
  function urlSearchParams (line 35) | function urlSearchParams(searchString) {
  function is_a_match (line 48) | function is_a_match(elem, text) {
  function setSelects (line 55) | function setSelects(urlParams, dontPushState) {
  function showContent (line 102) | function showContent() {
  function setContent (line 113) | function setContent() {

FILE: docs/tutorial_utils/vision/cnn_visualization/gradcam.py
  class ReluOp (line 29) | class ReluOp(mx.operator.CustomOp):
    method forward (line 38) | def forward(self, is_train, req, in_data, out_data, aux):
    method backward (line 43) | def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
  function set_guided_backprop (line 62) | def set_guided_backprop(mode=True):
  class ReluProp (line 66) | class ReluProp(mx.operator.CustomOpProp):
    method __init__ (line 67) | def __init__(self):
    method infer_shape (line 70) | def infer_shape(self, in_shapes):
    method create_operator (line 75) | def create_operator(self, ctx, in_shapes, in_dtypes):
  class Activation (line 78) | class Activation(mx.gluon.HybridBlock):
    method set_guided_backprop (line 80) | def set_guided_backprop(mode=False):
    method __init__ (line 83) | def __init__(self, act_type, **kwargs):
    method forward (line 87) | def forward(self, x):
  class Conv2D (line 90) | class Conv2D(mx.gluon.HybridBlock):
    method __init__ (line 100) | def __init__(self, channels, kernel_size, strides=(1, 1), padding=(0, 0),
    method forward (line 110) | def forward(self, x):
  function set_capture_layer_name (line 118) | def set_capture_layer_name(name):
  function _get_grad (line 121) | def _get_grad(net, image, class_id=None, conv_layer_name=None, image_gra...
  function get_conv_out_grad (line 168) | def get_conv_out_grad(net, image, class_id=None, conv_layer_name=None):
  function get_image_grad (line 184) | def get_image_grad(net, image, class_id=None):
  function grad_to_image (line 198) | def grad_to_image(gradient):
  function get_cam (line 208) | def get_cam(imggrad, conv_out):
  function get_guided_grad_cam (line 220) | def get_guided_grad_cam(cam, imggrad):
  function get_img_heatmap (line 224) | def get_img_heatmap(orig_img, activation_map):
  function to_grayscale (line 233) | def to_grayscale(cv2im):
  function visualize (line 246) | def visualize(net, preprocessed_img, orig_img, conv_layer_name):

FILE: example/distributed_training-horovod/gluon_mnist.py
  function get_mnist_iterator (line 56) | def get_mnist_iterator(rank):
  function conv_nets (line 91) | def conv_nets():
  function evaluate (line 105) | def evaluate(model, data_iter, context):

FILE: example/distributed_training-horovod/resnet50_imagenet.py
  function get_data_rec (line 139) | def get_data_rec(rec_train, rec_train_idx, rec_val, rec_val_idx, batch_s...
  class SyntheticDataIter (line 202) | class SyntheticDataIter(DataIter):
    method __init__ (line 203) | def __init__(self, num_classes, data_shape, max_iter, dtype, ctx):
    method __iter__ (line 215) | def __iter__(self):
    method provide_data (line 219) | def provide_data(self):
    method provide_label (line 223) | def provide_label(self):
    method next (line 227) | def next(self):
    method __next__ (line 239) | def __next__(self):
    method reset (line 242) | def reset(self):
  function train_gluon (line 280) | def train_gluon():

FILE: example/distributed_training/cifar10_dist.py
  function transform (line 54) | def transform(data, label):
  class SplitSampler (line 58) | class SplitSampler(gluon.data.sampler.Sampler):
    method __init__ (line 70) | def __init__(self, length, num_parts=1, part_index=0):
    method __iter__ (line 78) | def __iter__(self):
    method __len__ (line 84) | def __len__(self):
  function evaluate_accuracy (line 110) | def evaluate_accuracy(data_iterator, network):
  function forward_backward (line 150) | def forward_backward(network, data, label):
  function train_batch (line 163) | def train_batch(batch_list, context, network, gluon_trainer):

FILE: example/distributed_training/cifar10_kvstore_hvd.py
  function transform (line 57) | def transform(data, label):
  function train (line 63) | def train(batch_list, context, network, gluon_trainer, metric):
  function evaluate (line 112) | def evaluate(data_iterator, network, context):
  class SplitSampler (line 148) | class SplitSampler(gluon.data.sampler.Sampler):
    method __init__ (line 161) | def __init__(self, length, num_parts=1, part_index=0):
    method __iter__ (line 169) | def __iter__(self):
    method __len__ (line 175) | def __len__(self):

FILE: example/extensions/lib_api/init_lib.cc
  function MXReturnValue (line 30) | MXReturnValue initialize(int version) {

FILE: example/extensions/lib_api/libtest.cc
  function main (line 38) | int main(void) {

FILE: example/extensions/lib_custom_op/gemm_lib.cc
  function gemm (line 32) | void gemm(const float* A,
  function transpose (line 49) | void transpose(const float* A, float* At, const unsigned n, const unsign...
  function MXReturnValue (line 62) | MXReturnValue forward(const std::unordered_map<std::string, std::string>...
  function MXReturnValue (line 96) | MXReturnValue backward(const std::unordered_map<std::string, std::string...
  function MXReturnValue (line 124) | MXReturnValue parseAttrs(const std::unordered_map<std::string, std::stri...
  function MXReturnValue (line 132) | MXReturnValue inferType(const std::unordered_map<std::string, std::strin...
  function MXReturnValue (line 151) | MXReturnValue inferShape(const std::unordered_map<std::string, std::stri...
  class MyStatefulGemm (line 186) | class MyStatefulGemm : public CustomStatefulOp {
    method MyStatefulGemm (line 188) | explicit MyStatefulGemm(int count, std::unordered_map<std::string, std...
    method MXReturnValue (line 195) | MXReturnValue Forward(std::vector<MXTensor>* inputs,
    method MXReturnValue (line 202) | MXReturnValue Backward(std::vector<MXTensor>* inputs,
  function MXReturnValue (line 213) | MXReturnValue createOpState(const std::unordered_map<std::string, std::s...
  function MXReturnValue (line 226) | MXReturnValue mutateInputs(const std::unordered_map<std::string, std::st...
  function MXReturnValue (line 239) | MXReturnValue initialize(int version) {

FILE: example/extensions/lib_custom_op/relu_lib.cc
  function MXReturnValue (line 30) | MXReturnValue parseAttrs(const std::unordered_map<std::string, std::stri...
  function MXReturnValue (line 38) | MXReturnValue inferType(const std::unordered_map<std::string, std::strin...
  function MXReturnValue (line 45) | MXReturnValue inferShape(const std::unordered_map<std::string, std::stri...
  function MXReturnValue (line 52) | MXReturnValue forwardCPU(const std::unordered_map<std::string, std::stri...
  function MXReturnValue (line 64) | MXReturnValue backwardCPU(const std::unordered_map<std::string, std::str...
  function MXReturnValue (line 89) | MXReturnValue MyStatefulReluCPU::Forward(std::vector<MXTensor>* inputs,
  function MXReturnValue (line 95) | MXReturnValue MyStatefulReluCPU::Backward(std::vector<MXTensor>* inputs,
  function MXReturnValue (line 104) | MXReturnValue MyStatefulReluGPU::Forward(std::vector<MXTensor>* inputs,
  function MXReturnValue (line 110) | MXReturnValue MyStatefulReluGPU::Backward(std::vector<MXTensor>* inputs,
  function MXReturnValue (line 116) | MXReturnValue createOpStateCPU(const std::unordered_map<std::string, std...
  function MXReturnValue (line 125) | MXReturnValue createOpStateGPU(const std::unordered_map<std::string, std...
  function MXReturnValue (line 141) | MXReturnValue noisyForwardCPU(const std::unordered_map<std::string, std:...
  function MXReturnValue (line 167) | MXReturnValue initialize(int version) {

FILE: example/extensions/lib_custom_op/relu_lib.h
  function class (line 36) | class MyStatefulReluCPU : public CustomStatefulOp {
  function class (line 51) | class MyStatefulReluGPU : public CustomStatefulOp {

FILE: example/extensions/lib_custom_op/transposecsr_lib.cc
  function transpose (line 31) | void transpose(MXTensor& src, MXTensor& dst, const OpResource& res) {
  function MXReturnValue (line 68) | MXReturnValue forward(const std::unordered_map<std::string, std::string>...
  function MXReturnValue (line 86) | MXReturnValue backward(const std::unordered_map<std::string, std::string...
  function MXReturnValue (line 93) | MXReturnValue parseAttrs(const std::unordered_map<std::string, std::stri...
  function MXReturnValue (line 101) | MXReturnValue inferType(const std::unordered_map<std::string, std::strin...
  function MXReturnValue (line 118) | MXReturnValue inferSType(const std::unordered_map<std::string, std::stri...
  function MXReturnValue (line 129) | MXReturnValue inferShape(const std::unordered_map<std::string, std::stri...
  class MyStatefulTransposeCSR (line 153) | class MyStatefulTransposeCSR : public CustomStatefulOp {
    method MyStatefulTransposeCSR (line 155) | explicit MyStatefulTransposeCSR(int count, std::unordered_map<std::str...
    method MXReturnValue (line 158) | MXReturnValue Forward(std::vector<MXTensor>* inputs,
    method MXReturnValue (line 165) | MXReturnValue Backward(std::vector<MXTensor>* inputs,
  function MXReturnValue (line 176) | MXReturnValue createOpState(const std::unordered_map<std::string, std::s...
  function MXReturnValue (line 196) | MXReturnValue initialize(int version) {

FILE: example/extensions/lib_custom_op/transposerowsp_lib.cc
  function transpose (line 31) | void transpose(MXTensor& src, MXTensor& dst, const OpResource& res) {
  function MXReturnValue (line 70) | MXReturnValue forward(const std::unordered_map<std::string, std::string>...
  function MXReturnValue (line 87) | MXReturnValue backward(const std::unordered_map<std::string, std::string...
  function MXReturnValue (line 94) | MXReturnValue parseAttrs(const std::unordered_map<std::string, std::stri...
  function MXReturnValue (line 102) | MXReturnValue inferType(const std::unordered_map<std::string, std::strin...
  function MXReturnValue (line 119) | MXReturnValue inferSType(const std::unordered_map<std::string, std::stri...
  function MXReturnValue (line 130) | MXReturnValue inferShape(const std::unordered_map<std::string, std::stri...
  class MyStatefulTransposeRowSP (line 154) | class MyStatefulTransposeRowSP : public CustomStatefulOp {
    method MyStatefulTransposeRowSP (line 156) | explicit MyStatefulTransposeRowSP(int count, std::unordered_map<std::s...
    method MXReturnValue (line 159) | MXReturnValue Forward(std::vector<MXTensor>* inputs,
    method MXReturnValue (line 166) | MXReturnValue Backward(std::vector<MXTensor>* inputs,
  function MXReturnValue (line 177) | MXReturnValue createOpState(const std::unordered_map<std::string, std::s...
  function MXReturnValue (line 198) | MXReturnValue initialize(int version) {

FILE: example/extensions/lib_external_ops/init_lib.cc
  function MXReturnValue (line 30) | MXReturnValue initialize(int version) {

FILE: example/extensions/lib_external_ops/min_ex-inl.h
  function namespace (line 35) | namespace mxnet {

FILE: example/extensions/lib_external_ops/min_ex.cc
  type mxnet (line 27) | namespace mxnet {
    type op (line 28) | namespace op {

FILE: example/extensions/lib_pass/pass_lib.cc
  function MXReturnValue (line 33) | MXReturnValue myPass(mxnet::ext::Graph* g,
  function MXReturnValue (line 44) | MXReturnValue initialize(int version) {

FILE: example/extensions/lib_pass/test_pass.py
  function test_model (line 50) | def test_model(pass_name):

FILE: example/extensions/lib_subgraph/subgraph_lib.cc
  function myLog (line 34) | void myLog(MXTensor* in, MXTensor* out) {
  function myExp (line 42) | void myExp(MXTensor* in, MXTensor* out) {
  function MXReturnValue (line 55) | MXReturnValue myExecutor(std::vector<MXTensor>* inputs,
  class MyStatefulOp (line 142) | class MyStatefulOp : public CustomStatefulOp {
    method MyStatefulOp (line 144) | explicit MyStatefulOp(std::string json, const std::unordered_map<std::...
    method MXReturnValue (line 152) | MXReturnValue Forward(std::vector<MXTensor>* inputs,
  function MXReturnValue (line 166) | MXReturnValue createOpState(const std::unordered_map<std::string, std::s...
  function MXReturnValue (line 187) | MXReturnValue mySupportedOps(const mxnet::ext::Graph* graph,
  function MXReturnValue (line 218) | MXReturnValue myReviewSubgraph(const mxnet::ext::Graph* subgraph,
  class MySelector (line 251) | class MySelector : public CustomOpSelector {
    method MySelector (line 253) | MySelector(const mxnet::ext::Graph* graph,
    method chooseNode (line 260) | bool chooseNode(int nodeID) {
    method Select (line 281) | bool Select(int nodeID) override {
    method SelectInput (line 284) | bool SelectInput(int nodeID, int input_nodeID) override {
    method SelectOutput (line 287) | bool SelectOutput(int nodeID, int output_nodeID) override {
    method Filter (line 290) | virtual void Filter(std::vector<int>& candidates, std::vector<int>& ke...
    method Reset (line 293) | void Reset() override {}
  function MXReturnValue (line 300) | MXReturnValue createSelector(const mxnet::ext::Graph* graph,
  function MXReturnValue (line 314) | MXReturnValue addInputPass(mxnet::ext::Graph* graph,
  function MXReturnValue (line 341) | MXReturnValue initialize(int version) {

FILE: example/extensions/lib_subgraph/test_subgraph.py
  function test (line 51) | def test(backend):

FILE: example/gluon/actor_critic/actor_critic.py
  class Policy (line 47) | class Policy(gluon.Block):
    method __init__ (line 48) | def __init__(self, **kwargs):
    method forward (line 54) | def forward(self, x):

FILE: example/gluon/data.py
  function get_cifar10_iterator (line 33) | def get_cifar10_iterator(batch_size, data_shape, resize=-1, num_parts=1,...
  function get_imagenet_transforms (line 60) | def get_imagenet_transforms(data_shape=224, dtype='float32'):
  function get_imagenet_iterator (line 76) | def get_imagenet_iterator(root, batch_size, num_workers, data_shape=224,...
  function get_caltech101_data (line 93) | def get_caltech101_data():
  function get_caltech101_iterator (line 110) | def get_caltech101_iterator(batch_size, num_workers, dtype):
  class DummyIter (line 128) | class DummyIter(mx.io.DataIter):
    method __init__ (line 129) | def __init__(self, batch_size, data_shape, batches = 100):
    method next (line 140) | def next(self):
  function dummy_iterator (line 148) | def dummy_iterator(batch_size, data_shape):
  class ImagePairIter (line 151) | class ImagePairIter(mx.io.DataIter):
    method __init__ (line 152) | def __init__(self, path, data_shape, label_shape, batch_size=64, flag=...
    method next (line 166) | def next(self):
    method reset (line 195) | def reset(self):

FILE: example/gluon/house_prices/kaggle_k_fold_cross_validation.py
  function get_rmse_log (line 66) | def get_rmse_log(net, X_train, y_train):
  function get_net (line 73) | def get_net():
  function train (line 81) | def train(net, X_train, y_train, epochs, verbose_epoch, learning_rate,
  function k_fold_cross_valid (line 103) | def k_fold_cross_valid(k, epochs, verbose_epoch, X_train, y_train,
  function learn (line 150) | def learn(epochs, verbose_epoch, X_train, y_train, test, learning_rate,

FILE: example/gluon/image_classification.py
  function get_model (line 117) | def get_model(model, device, opt):
  function get_data_iters (line 138) | def get_data_iters(dataset, batch_size, opt):
  function test (line 162) | def test(device, val_data):
  function update_learning_rate (line 174) | def update_learning_rate(lr, trainer, epoch, ratio, steps):
  function save_checkpoint (line 180) | def save_checkpoint(epoch, top1, best_acc):
  function train (line 191) | def train(opt, device):
  function main (line 256) | def main():

FILE: example/gluon/mnist/mnist.py
  function transformer (line 57) | def transformer(data, label):
  function test (line 71) | def test(ctx):
  function train (line 82) | def train(epochs, ctx):

FILE: example/gluon/super_resolution/super_resolution.py
  function get_dataset (line 70) | def get_dataset(prefetch=False):
  class SuperResolutionNet (line 139) | class SuperResolutionNet(gluon.HybridBlock):
    method __init__ (line 140) | def __init__(self, upscale_factor):
    method forward (line 148) | def forward(self, x):
  function test (line 159) | def test(device):
  function train (line 177) | def train(epoch, device):
  function resolve (line 208) | def resolve(device):

FILE: example/profiler/profiler_imageiter.py
  function run_imageiter (line 26) | def run_imageiter(path_rec, n, batch_size=32):

FILE: example/profiler/profiler_matmul.py
  function parse_args (line 24) | def parse_args():

FILE: example/profiler/profiler_ndarray.py
  function _np_reduce (line 24) | def _np_reduce(dat, axis, keepdims, numpy_reduce_func):
  function reldiff (line 40) | def reldiff(a, b):
  function same (line 47) | def same(a, b):
  function check_with_uniform (line 51) | def check_with_uniform(uf, arg_shapes, dim=None, npuf=None, rmin=-10, ty...
  function random_ndarray (line 80) | def random_ndarray(dim):
  function test_ndarray_elementwise (line 86) | def test_ndarray_elementwise():
  function test_ndarray_negate (line 104) | def test_ndarray_negate():
  function test_ndarray_choose (line 116) | def test_ndarray_choose():
  function test_ndarray_fill (line 127) | def test_ndarray_fill():
  function test_ndarray_onehot (line 142) | def test_ndarray_onehot():
  function test_ndarray_copy (line 155) | def test_ndarray_copy():
  function test_ndarray_scalar (line 161) | def test_ndarray_scalar():
  function test_ndarray_pickle (line 176) | def test_ndarray_pickle():
  function test_ndarray_saveload (line 192) | def test_ndarray_saveload():
  function test_ndarray_slice (line 216) | def test_ndarray_slice():
  function test_ndarray_slice_along_axis (line 226) | def test_ndarray_slice_along_axis():
  function test_clip (line 238) | def test_clip():
  function test_dot (line 248) | def test_dot():
  function test_reduce (line 258) | def test_reduce():
  function test_broadcast (line 294) | def test_broadcast():

FILE: example/quantization/imagenet_gen_qsym_onednn.py
  function download_calib_dataset (line 37) | def download_calib_dataset(dataset_url, calib_dataset, logger=None):
  function get_from_gluon (line 43) | def get_from_gluon(model_name, classes=1000, logger=None):
  function regex_find_excluded_symbols (line 53) | def regex_find_excluded_symbols(patterns_dict, model_name):
  function get_exclude_symbols (line 60) | def get_exclude_symbols(model_name, exclude_first_conv):

FILE: example/quantization/imagenet_inference.py
  function download_dataset (line 29) | def download_dataset(dataset_url, dataset_dir, logger=None):
  function score (line 35) | def score(symblock, data, ctx, max_num_examples, skip_num_batches, logge...
  function initialize_block_params (line 64) | def initialize_block_params(block, initializer):
  function benchmark_score (line 72) | def benchmark_score(symblock, ctx, batch_size, warmup_batches, num_batch...

FILE: example/quantization_inc/custom_strategy.py
  function calc_approx_error (line 25) | def calc_approx_error(expected_tensor: np.ndarray, observed_tensor: np.n...
  function get_approx_errors (line 37) | def get_approx_errors(expected_tensors, observed_tensors):
  class MyCustomTuneStrategy (line 50) | class MyCustomTuneStrategy(TuneStrategy):
    method __init__ (line 52) | def __init__(self, model, conf, q_dataloader, q_func=None,
    method get_qtensors (line 65) | def get_qtensors(self, quant_cfg, node_list):
    method next_tune_cfg (line 73) | def next_tune_cfg(self):
    method bayesian_params_to_tune_configs (line 156) | def bayesian_params_to_tune_configs(self, params):
    method bayesian_configurations (line 168) | def bayesian_configurations(self, cfg_base, params_base):

FILE: example/quantization_inc/resnet_measurement.py
  function test_accuracy (line 24) | def test_accuracy(net, data_loader, description):

FILE: example/quantization_inc/resnet_mse.py
  function eval_func (line 45) | def eval_func(model):

FILE: example/quantization_inc/resnet_tuning.py
  function save_model (line 30) | def save_model(net, data_loader, description, time_spend):
  function eval_func (line 98) | def eval_func(model):

FILE: example/recommenders/matrix_fact.py
  function evaluate_network (line 28) | def evaluate_network(network, data_iterator, ctx):
  function train (line 40) | def train(network, train_data, test_data, epochs, learning_rate=0.01, op...

FILE: example/recommenders/movielens_data.py
  function load_mldataset (line 25) | def load_mldataset(filename):
  function ensure_local_data (line 45) | def ensure_local_data(prefix):
  function get_dataset (line 56) | def get_dataset(prefix='ml-100k'):
  function max_id (line 63) | def max_id(fname):

FILE: include/mxnet/api_registry.h
  function namespace (line 33) | namespace mxnet {

FILE: include/mxnet/base.h
  function namespace (line 75) | namespace mxnet {
  function operator (line 132) | inline bool operator==(const Context& b) const {
  function operator (line 140) | inline bool operator!=(const Context& b) const {
  function Save (line 147) | inline void Save(dmlc::Stream* strm) const {
  function Load (line 156) | inline bool Load(dmlc::Stream* strm) {
  function class (line 234) | class GPUAuxStream {
  function PreAuxStreamUseSync (line 266) | void PreAuxStreamUseSync() {
  function PostAuxStreamUseSync (line 274) | void PostAuxStreamUseSync() {
  function StreamSync (line 289) | static void StreamSync(mshadow::Stream<gpu>* s1, mshadow::Stream<gpu>* s...
  function class (line 308) | class SyncedGPUAuxStream {
  type RunContext (line 343) | struct RunContext {
  function namespace (line 384) | namespace mxnet {
  function namespace (line 550) | namespace std {

FILE: include/mxnet/c_api.h
  type mx_uint (line 65) | typedef uint32_t mx_uint;
  type mx_float (line 67) | typedef float mx_float;
  type dim_t (line 69) | typedef int64_t dim_t;
  type NativeOpInfo (line 129) | struct NativeOpInfo {
  type NDArrayOpInfo (line 143) | struct NDArrayOpInfo {
  type MXCallbackList (line 161) | struct MXCallbackList {
  type LibFeature (line 167) | struct LibFeature {
  type CustomOpCallbacks (line 172) | enum CustomOpCallbacks { kCustomOpDelete, kCustomOpForward, kCustomOpBac...
  type CustomOpPropCallbacks (line 174) | enum CustomOpPropCallbacks {
  type MXCallbackList (line 216) | struct MXCallbackList
  type MXCallbackList (line 222) | struct MXCallbackList
  type CustomFunctionCallbacks (line 224) | enum CustomFunctionCallbacks { kCustomFunctionBackward, kCustomFunctionD...
  type LibFeature (line 263) | struct LibFeature
  type OtherOptionEntity (line 575) | struct OtherOptionEntity {
  type OtherOptionSpace (line 579) | struct OtherOptionSpace {
  type ConfigSpace (line 584) | struct ConfigSpace {
  type ConfigSpaces (line 593) | typedef struct ConfigSpaces {
  type MXCallbackList (line 2922) | struct MXCallbackList

FILE: include/mxnet/c_api_error.h
  function namespace (line 72) | namespace mxnet {

FILE: include/mxnet/engine.h
  function namespace (line 36) | namespace mxnet {
  function class (line 169) | class CallbackOnComplete {
  function FnProperty (line 191) | enum class FnProperty {

FILE: include/mxnet/executor.h
  function virtual (line 84) | virtual void Print(std::ostream& os) const {}

FILE: include/mxnet/expr_operator.h
  function namespace (line 34) | namespace mxnet {

FILE: include/mxnet/graph_attr_types.h
  function namespace (line 30) | namespace mxnet {

FILE: include/mxnet/imperative.h
  type class (line 40) | enum class
  type NumpyShape (line 57) | enum NumpyShape { Off, ThreadLocalOn, GlobalOn }
  type NumpyShape (line 58) | typedef NumpyShape NumpyDefaultDtype;
  function Clear (line 76) | static void Clear(const nnvm::ObjectPtr& node) {
  function AGInfo (line 85) | static AGInfo& Get(const nnvm::ObjectPtr& node) {
  function AGInfo (line 89) | static AGInfo& Create(const nnvm::ObjectPtr& node) {
  function IsNone (line 94) | static bool IsNone(const NDArray& arr) {
  function IsVariable (line 98) | static bool IsVariable(const nnvm::ObjectPtr& node) {
  function class (line 105) | class DCInfo {
  function set_is_training (line 179) | bool set_is_training(bool is_train) {
  function set_is_recording (line 189) | bool set_is_recording(bool is_recording) {
  function set_is_deferred_compute (line 199) | bool set_is_deferred_compute(bool is_deferred_compute) {
  function set_is_np_shape (line 214) | bool set_is_np_shape(int is_np_shape) {
  function set_is_np_default_dtype (line 241) | bool set_is_np_default_dtype(bool is_np_default_dtype) {
  function OptConstraint (line 255) | OptConstraint set_opt_constraints(OptConstraint constraints) {
  function PreferBulkExecInference (line 308) | static bool PreferBulkExecInference() {
  function PreferBulkExecTrain (line 312) | static bool PreferBulkExecTrain() {
  function BulkExecMaxNodeTrainFwd (line 316) | static int BulkExecMaxNodeTrainFwd() {
  function BulkExecMaxNodeTrainBwd (line 321) | static int BulkExecMaxNodeTrainBwd() {
  function is_np_shape_global_ (line 357) | bool is_np_shape_global_{false};

FILE: include/mxnet/io.h
  function namespace (line 36) | namespace mxnet {

FILE: include/mxnet/ir/expr.h
  function class (line 40) | class BaseExprNode : public Object {
  function class (line 50) | class BaseExpr : public ObjectRef {
  function class (line 75) | class PrimExprNode : public BaseExprNode {
  function class (line 101) | class PrimExpr : public BaseExpr {
  function class (line 152) | class IntImm : public PrimExpr {
  function class (line 183) | class FloatImmNode : public PrimExprNode {

FILE: include/mxnet/kvstore.h
  type class (line 48) | enum class
  function class (line 56) | class KVStore {

FILE: include/mxnet/lib_api.h
  type DLDeviceType (line 101) | typedef enum {
  type DLContext (line 132) | typedef struct {
  type DLDataTypeCode (line 142) | typedef enum {
  type DLDataType (line 156) | typedef struct {
  type DLTensor (line 174) | typedef struct {
  function namespace (line 216) | namespace mxnet {

FILE: include/mxnet/libinfo.h
  function namespace (line 131) | namespace mxnet {

FILE: include/mxnet/ndarray.h
  function namespace (line 47) | namespace dnnl {
  function namespace (line 51) | namespace mxnet {
  function InitDetached (line 196) | void InitDetached(const NDArray* src) {
  function ReInit (line 200) | inline void ReInit() {
  function IsSame (line 236) | inline bool IsSame(const NDArray& other) const {
  function mxnet (line 264) | inline const mxnet::TShape& aux_shape(size_t index) const {
  function set_aux_shape (line 289) | inline void set_aux_shape(size_t index, const mxnet::TShape& shape) const {
  function NDArray (line 307) | NDArray grad() const;
  function aux_type (line 338) | inline int aux_type(size_t i) const {
  function WaitToWrite (line 403) | void WaitToWrite() const;
  function NDArray (line 586) | inline NDArray AsArray(const mxnet::TShape& shape, int dtype) const {
  function InitAsArray (line 599) | inline void InitAsArray(const NDArray& src, const mxnet::TShape& shape, ...
  function SparseUpdateChunk (line 639) | inline void SparseUpdateChunk(const NDArray& arr) const {
  function NDArrayFunctionReg (line 1396) | inline NDArrayFunctionReg& set_function(void (*fsetvalue)(const real_t& ...
  function NDArrayFunctionReg (line 1414) | inline NDArrayFunctionReg& set_function(
  function NDArrayFunctionReg (line 1438) | inline NDArrayFunctionReg& set_function(void (*fbinary)(const NDArray& lhs,
  function NDArrayFunctionReg (line 1462) | inline NDArrayFunctionReg& set_function(void (*fscalar)(const NDArray& lhs,
  function NDArrayFunctionReg (line 1485) | inline NDArrayFunctionReg& set_function(void (*funary)(const NDArray& sr...
  function NDArrayFunctionReg (line 1504) | inline NDArrayFunctionReg& set_function(
  function NDArrayFunctionReg (line 1528) | inline NDArrayFunctionReg& set_num_use_vars(unsigned n) {
  function NDArrayFunctionReg (line 1537) | inline NDArrayFunctionReg& set_num_mutate_vars(unsigned n) {
  function NDArrayFunctionReg (line 1546) | inline NDArrayFunctionReg& set_num_scalars(unsigned n) {
  function NDArrayFunctionReg (line 1555) | inline NDArrayFunctionReg& set_type_mask(int tmask) {
  function namespace (line 1577) | namespace dmlc {

FILE: include/mxnet/node/container.h
  function class (line 39) | class ArrayNode : public Object {

FILE: include/mxnet/node/node.h
  function namespace (line 47) | namespace mxnet {

FILE: include/mxnet/op_attr_types.h
  function namespace (line 40) | namespace mxnet {
  type class (line 98) | enum class
  type class (line 122) | enum class
  type class (line 135) | enum class
  function class (line 148) | class OpStatePtr {

FILE: include/mxnet/operator.h
  function namespace (line 41) | namespace mxnet {
  function class (line 127) | class OperatorProperty {
  function virtual (line 221) | virtual bool InferType(std::vector<int>* in_type,
  function virtual (line 261) | virtual Operator* CreateOperatorEx(Context ctx,
  function virtual (line 290) | virtual std::vector<ResourceRequest> ForwardResource(const mxnet::ShapeV...
  function virtual (line 300) | virtual std::vector<ResourceRequest> BackwardResource(const mxnet::Shape...
  function virtual (line 325) | virtual std::vector<int> DeclareBackwardDependency(const std::vector<int...
  type std (line 450) | typedef std::function<OperatorProperty*()> OperatorPropertyFactory;
  type OperatorPropertyReg (line 454) | struct OperatorPropertyReg
  function OperatorPropertyReg (line 468) | inline OperatorPropertyReg& set_key_var_num_args(const std::string& key)...
  function OperatorPropertyReg (line 475) | inline OperatorPropertyReg& check_name() {

FILE: include/mxnet/operator_util.h
  function namespace (line 49) | namespace mxnet {

FILE: include/mxnet/random_generator.h
  function namespace (line 36) | namespace mxnet {

FILE: include/mxnet/resource.h
  function namespace (line 33) | namespace mxnet {
  function namespace (line 65) | namespace {
  type Resource (line 90) | struct Resource {
  function class (line 239) | class ResourceManager {

FILE: include/mxnet/rtc.h
  function namespace (line 35) | namespace mxnet {

FILE: include/mxnet/runtime/c_runtime_api.h
  type MXNetTypeCode (line 41) | typedef enum {
  type MXNetValue (line 72) | typedef union {
  type MXNetByteArray (line 85) | typedef struct {

FILE: include/mxnet/runtime/container.h
  function namespace (line 36) | namespace mxnet {
  function class (line 209) | class ADT : public ObjectRef {

FILE: include/mxnet/runtime/container_ext.h
  function namespace (line 39) | namespace mxnet {
  type mxnet (line 677) | struct mxnet
  function memncmp (line 878) | inline int String::memncmp(const char* lhs, const char* rhs, size_t lhs_...
  function const (line 897) | inline size_t ObjectRefHash::operator()(const ObjectRef& a) const {
  function const (line 904) | inline bool ObjectRefEqual::operator()(const ObjectRef& a, const ObjectR...

FILE: include/mxnet/runtime/data_type.h
  function namespace (line 32) | namespace mxnet {

FILE: include/mxnet/runtime/ffi_helper.h
  function ObjectRef (line 44) | inline ObjectRef CreateEllipsis() {
  function SliceNoneValue (line 80) | int64_t inline SliceNoneValue() {
  function class (line 109) | class Float : public ObjectRef {

FILE: include/mxnet/runtime/memory.h
  function namespace (line 32) | namespace mxnet {

FILE: include/mxnet/runtime/ndarray.h
  function namespace (line 28) | namespace mxnet {

FILE: include/mxnet/runtime/ndarray_handle.h
  function class (line 40) | class NDArrayHandle : public ObjectRef {

FILE: include/mxnet/runtime/object.h
  function namespace (line 47) | namespace mxnet {
  function class (line 500) | class ObjectRef {
  function const (line 620) | struct ObjectHash {
  function const (line 632) | struct ObjectEqual {
  function IncRef (line 727) | inline void Object::IncRef() {
  function DecRef (line 731) | inline void Object::DecRef() {
  function IncRef (line 746) | inline void Object::IncRef() {
  function DecRef (line 750) | inline void Object::DecRef() {
  function IsInstance (line 765) | bool Object::IsInstance() const {
  function ObjectType (line 804) | const ObjectType* ObjectRef::as() const {
  function RefType (line 813) | RefType GetRef(const ObjType* ptr) {
  function SubRef (line 830) | SubRef Downcast(BaseRef ref) {

FILE: include/mxnet/runtime/packed_func.h
  function namespace (line 52) | namespace mxnet {
  function class (line 322) | class MXNetArgs {
  function Check (line 385) | static bool Check(const Object* ptr) {
  function std (line 391) | static std::string TypeName() {
  function class (line 401) | class MXNetPODValue_ {
  function type_code_ (line 465) | type_code_(kNull) {}
  function class (line 480) | class MXNetArgValue : public MXNetPODValue_ {
  function operator (line 526) | operator MXNetDataType() const {
  function class (line 555) | class MXNetRetValue : public MXNetPODValue_ {
  function MXNetPODValue_ (line 582) | MXNetRetValue(const MXNetRetValue& other) : MXNetPODValue_() {
  function operator (line 586) | operator std::string() const {
  function operator (line 600) | operator MXNetDataType() const {
  function MoveToCHost (line 714) | void MoveToCHost(MXNetValue* ret_value, int* ret_type_code) {
  function SwitchToPOD (line 759) | void SwitchToPOD(int type_code) {
  function SwitchToObject (line 775) | void SwitchToObject(int type_code, ObjectPtr<Object> other) {
  function Clear (line 786) | void Clear() {
  function DLDataType (line 805) | inline DLDataType String2DLDataType(std::string s) {
  function String2MXNetTypeWithBool (line 881) | inline int String2MXNetTypeWithBool(const std::string& s) {
  function String2MXNetType (line 915) | inline int String2MXNetType(const std::string& s) {
  function MXNetArgValue (line 971) | inline MXNetArgValue MXNetArgs::operator[](int i) const {
  function CallPacked (line 981) | inline void PackedFunc::CallPacked(MXNetArgs args, MXNetRetValue* rv) co...
  function namespace (line 990) | namespace detail {
  function class (line 1013) | class MXNetArgsSetter {
  function const (line 1022) | void operator()(size_t i, uint64_t value) const {
  function const (line 1027) | void operator()(size_t i, double value) const {
  function const (line 1031) | void operator()(size_t i, std::nullptr_t value) const {
  function const (line 1035) | void operator()(size_t i, const MXNetArgValue& value) const {
  function const (line 1039) | void operator()(size_t i, void* value) const {
  function const (line 1043) | void operator()(size_t i, const char* value) const {
  function const (line 1050) | void operator()(size_t i, const std::string& value) const {  // NOLINT(*)
  function const (line 1054) | void operator()(size_t i, DLDataType value) const {
  function const (line 1058) | void operator()(size_t i, MXNetDataType dtype) const {
  function const (line 1061) | void operator()(size_t i, const MXNetByteArray& value) const {  // NOLIN...
  function const (line 1069) | void operator()(size_t i, const ObjectRef& value) const {  // NOLINT(*)
  function const (line 1077) | void operator()(size_t i, const MXNetRetValue& value) const {  // NOLINT(*)
  function MXNetRetValue (line 1096) | MXNetRetValue PackedFunc::operator()(Args&&... args) const {
  function namespace (line 1107) | namespace detail {
  function run (line 1134) | void run(const F& f,
  function unpack_call (line 1143) | void unpack_call(const F& f, const MXNetArgs& args, MXNetRetValue* rv) {
  function R (line 1148) | R call_packed(const PackedFunc& pf, Args&&... args) {
  function R (line 1155) | inline R run(const PackedFunc& pf, Args&&... args) {
  function void (line 1161) | struct typed_packed_call_dispatcher<void> {
  function packed_ (line 1170) | packed_(packed) {}
  function namespace (line 1194) | namespace detail {
  function TObjectRef (line 1221) | static TObjectRef From(const MXNetArgValue& val) {
  function TObjectRef (line 1229) | static TObjectRef From(const MXNetRetValue& val) {
  function String (line 1235) | struct PackedFuncValueConverter<::mxnet::runtime::String> {
  function IsObjectRef (line 1283) | bool MXNetPODValue_::IsObjectRef() const {

FILE: include/mxnet/runtime/py_arg.h
  function namespace (line 26) | namespace mxnet {

FILE: include/mxnet/runtime/registry.h
  function namespace (line 51) | namespace mxnet {

FILE: include/mxnet/storage.h
  type SyncObj (line 45) | struct SyncObj {
  type Handle (line 56) | struct Handle {
  function size (line 64) | size_t size{0}
  function shared_pid (line 72) | int shared_pid{-1};

FILE: include/mxnet/tensor_blob.h
  function namespace (line 39) | namespace mxnet {
  function CheckContiguous (line 186) | inline bool CheckContiguous(void) const {
  function TBlob (line 194) | inline TBlob reshape(const mxnet::TShape& shape) const {
  function ndim (line 231) | inline int ndim(void) const {
  function index_t (line 240) | inline index_t size(index_t idx) const {
  function Size (line 244) | inline size_t Size(void) const {
  function DType (line 249) | DType* dptr() const {
  type FieldEntryBase (line 488) | typedef FieldEntryBase<FieldEntry<mxnet::TShape>, mxnet::TShape> Parent;
  function virtual (line 490) | virtual void Check(void* head) const {
  function FieldEntry (line 510) | inline FieldEntry<mxnet::TShape>& enforce_nonzero() {
  function FieldEntry (line 514) | inline FieldEntry<mxnet::TShape>& set_expect_ndim(int ndim) {

FILE: include/mxnet/tuple.h
  function namespace (line 41) | namespace mxnet {
  function explicit (line 120) | inline explicit Tuple(const runtime::ObjectRef& src) {
  function assign (line 136) | void assign(RandomAccessIterator begin, RandomAccessIterator end) {
  function swap (line 145) | inline void swap(Tuple<ValueType>& other) {  // NOLINT(*)
  function s (line 186) | inline bool operator==(const Tuple<ValueType>& s) const {
  function s (line 197) | inline bool operator!=(const Tuple<ValueType>& s) const {
  function ValueType (line 201) | inline const ValueType* begin() const {
  function ValueType (line 205) | inline ValueType* begin() {
  function ValueType (line 209) | inline const ValueType* end() const {
  function ValueType (line 213) | inline ValueType* end() {
  function ValueType (line 225) | inline ValueType& operator[](int i) {
  function ValueType (line 239) | inline const ValueType& operator[](int i) const {
  function Save (line 252) | inline void Save(dmlc::JSONWriter* writer) const {
  function Load (line 260) | inline void Load(dmlc::JSONReader* reader) {
  function ndim_ (line 392) | int ndim_{0}
  function num_heap_allocated_ (line 394) | int num_heap_allocated_{0}
  function ValueType (line 398) | ValueType* data_heap_{nullptr};
  function ndim_is_known (line 416) | inline bool ndim_is_known(const int ndim) {
  function dim_size_is_known (line 422) | inline bool dim_size_is_known(const dim_t dim_size) {
  function class (line 440) | class TShape : public Tuple<dim_t> {
  function explicit (line 499) | inline explicit TShape(const ObjectRef& src) : Tuple(src) {}
  function Size (line 523) | inline size_t Size() const {
  function ProdShape (line 538) | inline size_t ProdShape(int dimstart, int dimend) const {
  function dim_t (line 552) | inline const dim_t* data() const {
  function dim_t (line 556) | inline dim_t* data() {
  function mshadow (line 599) | inline mshadow::Shape<2> FlatTo2D(void) const {
  function mshadow (line 619) | inline mshadow::Shape<3> FlatTo3D(int axis_begin, int axis_end) const {
  function mshadow (line 646) | inline mshadow::Shape<3> FlatTo3D(int axis) const {
  function operator (line 649) | inline bool operator==(const TShape& s) const {
  function operator (line 654) | inline bool operator!=(const TShape& s) const {
  function ndim_is_known (line 686) | inline bool ndim_is_known(const TShape& x) {
  function dim_size_is_known (line 691) | inline bool dim_size_is_known(const TShape& x, const int idx) {
  function shape_is_known (line 699) | inline bool shape_is_known(const TShape& x) {
  function shape_is_known (line 709) | inline bool shape_is_known(const std::vector<TShape>& shapes) {
  function DstIter (line 719) | DstIter ShapeTypeCast(const SrcIter begin, const SrcIter end, DstIter ds...
  function TShape (line 728) | TShape ShapeTypeCast(const SrcIter begin, const SrcIter end) {
  function namespace (line 771) | namespace std {
  function namespace (line 801) | namespace dmlc {
  function namespace (line 816) | namespace mxnet {

FILE: plugin/opencv/cv_api.cc
  function get_jpeg_size (line 36) | bool get_jpeg_size(const unsigned char* data, mx_uint data_size, mx_uint...
  function get_png_size (line 74) | bool get_png_size(const unsigned char* data, mx_uint data_size, mx_uint*...
  function MXNET_DLL (line 86) | MXNET_DLL int MXCVImdecode(const unsigned char* img,
  function MXNET_DLL (line 124) | MXNET_DLL int MXCVResize(NDArrayHandle src,
  function MXNET_DLL (line 156) | MXNET_DLL int MXCVcopyMakeBorder(NDArrayHandle src,

FILE: plugin/opencv/opencv.py
  function imdecode (line 29) | def imdecode(str_img, flag=1):
  function resize (line 51) | def resize(src, size, interpolation=cv2.INTER_LINEAR):
  function copyMakeBorder (line 74) | def copyMakeBorder(src, top, bot, left, right, border_type=cv2.BORDER_CO...
  function scale_down (line 97) | def scale_down(src_size, size):
  function fixed_crop (line 107) | def fixed_crop(src, x0, y0, w, h, size=None, interpolation=cv2.INTER_CUB...
  function random_crop (line 114) | def random_crop(src, size):
  function color_normalize (line 125) | def color_normalize(src, mean, std):
  function random_size_crop (line 131) | def random_size_crop(src, size, min_area=0.25, ratio=(3.0/4.0, 4.0/3.0)):
  class ImageListIter (line 155) | class ImageListIter(mx.io.DataIter):
    method __init__ (line 157) | def __init__(self, root, flist, batch_size, size, mean=None):
    method reset (line 169) | def reset(self):
    method next (line 173) | def next(self):

FILE: plugin/sframe/iter_sframe.cc
  type mxnet (line 44) | namespace mxnet {
    type io (line 45) | namespace io {
      type SFrameParam (line 47) | struct SFrameParam : public dmlc::Parameter<SFrameParam> {
        method DMLC_DECLARE_PARAMETER (line 54) | DMLC_DECLARE_PARAMETER(SFrameParam) {
      class SFrameIterBase (line 69) | class SFrameIterBase : public IIterator<DataInst> {
        method SFrameIterBase (line 71) | SFrameIterBase() {}
        method Init (line 73) | void Init(const std::vector<std::pair<std::string, std::string> >&...
        method BeforeFirst (line 82) | virtual void BeforeFirst() {
        method DataInst (line 88) | virtual const DataInst& Value(void) const {
        method Copy_ (line 113) | void Copy_(mshadow::Tensor<cpu, dim> tensor, const graphlab::flex_...
      class SFrameImageIter (line 123) | class SFrameImageIter : public SFrameIterBase {
        method SFrameImageIter (line 125) | SFrameImageIter() : augmenter_(new ImageAugmenter()), prnd_(new co...
        method Init (line 127) | void Init(const std::vector<std::pair<std::string, std::string> >&...
        method Next (line 133) | bool Next(void) override {
      class SFrameDataIter (line 184) | class SFrameDataIter : public SFrameIterBase {
        method Next (line 186) | bool Next() override {

FILE: plugin/torch/torch_base.cc
  type mxnet (line 27) | namespace mxnet {
    function TorchState (line 47) | TorchState* TorchState::ThreadSharedLuaState() {

FILE: plugin/torch/torch_base.h
  function namespace (line 47) | namespace mxnet {

FILE: plugin/torch/torch_criterion-inl.h
  function namespace (line 40) | namespace mxnet {

FILE: plugin/torch/torch_criterion.cc
  type mxnet (line 28) | namespace mxnet {
    type op (line 29) | namespace op {
      function Operator (line 31) | Operator* CreateOp<cpu>(TorchCriterionParam param) {
      function Operator (line 36) | Operator* TorchCriterionProp::CreateOperator(Context ctx) const {

FILE: plugin/torch/torch_function.cc
  type mxnet (line 27) | namespace mxnet {
    type TorchMMShape (line 92) | struct TorchMMShape {
      method GetShape (line 93) | static std::vector<mshadow::TShape> GetShape(NDArray** u,
    type TorchMVShape (line 108) | struct TorchMVShape {
      method GetShape (line 109) | static std::vector<mshadow::TShape> GetShape(NDArray** u,
    type TorchBMMShape (line 124) | struct TorchBMMShape {
      method GetShape (line 125) | static std::vector<mshadow::TShape> GetShape(NDArray** u,
    type TorchGERShape (line 141) | struct TorchGERShape {
      method GetShape (line 142) | static std::vector<mshadow::TShape> GetShape(NDArray** u,

FILE: plugin/torch/torch_function.h
  function namespace (line 37) | namespace mxnet {
  type TorchFirstShape (line 174) | struct TorchFirstShape {

FILE: plugin/torch/torch_module-inl.h
  function namespace (line 40) | namespace mxnet {

FILE: plugin/torch/torch_module.cc
  type mxnet (line 28) | namespace mxnet {
    type op (line 29) | namespace op {
      function Operator (line 31) | Operator* CreateOp<cpu>(TorchModuleParam param, TorchState* torchSta...
      function Operator (line 36) | Operator* TorchModuleProp::CreateOperator(Context ctx) const {

FILE: plugin/warpctc/warpctc-inl.h
  function namespace (line 41) | namespace mxnet {

FILE: plugin/warpctc/warpctc.cc
  type mxnet (line 29) | namespace mxnet {
    type op (line 30) | namespace op {
      function Operator (line 32) | Operator* CreateOp<cpu>(WarpCTCParam param) {
      function Operator (line 36) | Operator* WarpCTCProp::CreateOperator(Context ctx) const {

FILE: python/mxnet/_ctypes/cached_op.py
  function _monitor_callback_wrapper (line 33) | def _monitor_callback_wrapper(callback):
  class CachedOp (line 40) | class CachedOp(object):
    method __init__ (line 44) | def __init__(self, sym, flags=(), thread_safe=False):
    method __del__ (line 57) | def __del__(self):
    method get_optimized_symbol (line 60) | def get_optimized_symbol(self):
    method __call__ (line 73) | def __call__(self, *args, **kwargs):
    method _register_op_hook (line 148) | def _register_op_hook(self, callback, monitor_all=False):

FILE: python/mxnet/_ctypes/ndarray.py
  class NDArrayBase (line 31) | class NDArrayBase(object):
    method __init__ (line 36) | def __init__(self, handle, writable=True):
    method __del__ (line 50) | def __del__(self):
    method __reduce__ (line 54) | def __reduce__(self):
  function _imperative_invoke (line 58) | def _imperative_invoke(handle, ndargs, keys, vals, out, is_np_op, output...

FILE: python/mxnet/_ctypes/space.py
  class COtherOptionEntity (line 26) | class COtherOptionEntity(ctypes.Structure):
  class COtherOptionSpace (line 31) | class COtherOptionSpace(ctypes.Structure):
  class CConfigSpace (line 37) | class CConfigSpace(ctypes.Structure):
  class CConfigSpaces (line 47) | class CConfigSpaces(ctypes.Structure):
  function c_other_option_entity (line 54) | def c_other_option_entity(x):
  function c_other_option_space (line 61) | def c_other_option_space(x):
  function c_config_space (line 70) | def c_config_space(x):
  function c_config_spaces (line 84) | def c_config_spaces(x):
  function _set_tvm_op_config (line 93) | def _set_tvm_op_config(x):

FILE: python/mxnet/_ctypes/symbol.py
  class SymbolBase (line 32) | class SymbolBase(object):
    method __init__ (line 36) | def __init__(self, handle):
    method __del__ (line 47) | def __del__(self):
    method _compose (line 51) | def _compose(self, *args, **kwargs):
    method _set_attr (line 93) | def _set_attr(self, **kwargs):
    method _set_handle (line 107) | def _set_handle(self, handle):
    method __reduce__ (line 111) | def __reduce__(self):
  function _set_symbol_class (line 115) | def _set_symbol_class(cls):
  function _set_np_symbol_class (line 121) | def _set_np_symbol_class(cls):
  function _symbol_creator (line 127) | def _symbol_creator(handle, args, kwargs, keys, vals, name, is_np_op, ou...

FILE: python/mxnet/_deferred_compute.py
  function is_deferred_compute (line 27) | def is_deferred_compute():
  function set_deferred_compute (line 33) | def set_deferred_compute(state):
  function context (line 50) | def context(state=True):
  function get_symbol (line 64) | def get_symbol(output_arrays, *, sym_cls=Symbol):
  function set_variable (line 87) | def set_variable(arrays, variables):
  function clear (line 109) | def clear(arrays):

FILE: python/mxnet/_ffi/_ctypes/function.py
  function _make_packed_func (line 38) | def _make_packed_func(handle, is_global):
  function _get_global_func (line 45) | def _get_global_func(name, allow_missing=False):
  function _make_mxnet_args (line 56) | def _make_mxnet_args(args, temp_args):
  class FunctionBase (line 105) | class FunctionBase(object):
    method __init__ (line 109) | def __init__(self, handle, is_global):
    method __del__ (line 123) | def __del__(self):
    method __call__ (line 128) | def __call__(self, *args):
  function __init_handle_by_constructor__ (line 148) | def __init_handle_by_constructor__(fconstructor, args):
  function _set_class_packed_func (line 169) | def _set_class_packed_func(packed_func_class):
  function _set_node_generic (line 174) | def _set_node_generic(func_convert_to_node):

FILE: python/mxnet/_ffi/_ctypes/object.py
  function _set_class_object (line 34) | def _set_class_object(object_class):
  function _register_object (line 39) | def _register_object(index, cls):
  function _return_object (line 47) | def _return_object(x):
  class PyNativeObject (line 66) | class PyNativeObject:
    method __init_mxnet_object_by_constructor__ (line 71) | def __init_mxnet_object_by_constructor__(self, fconstructor, *args):
  class ObjectBase (line 92) | class ObjectBase(object):
    method __del__ (line 96) | def __del__(self):
    method __init_handle_by_constructor__ (line 100) | def __init_handle_by_constructor__(self, fconstructor, *args):
    method same_as (line 124) | def same_as(self, other):

FILE: python/mxnet/_ffi/_ctypes/types.py
  class TypeCode (line 27) | class TypeCode(object):
  class MXNetValue (line 44) | class MXNetValue(ctypes.Union):

FILE: python/mxnet/_ffi/base.py
  function c_str (line 40) | def c_str(string):
  function c_array (line 55) | def c_array(ctype, values):

FILE: python/mxnet/_ffi/function.py
  class Function (line 47) | class Function(_FunctionBase):
  function get_global_func (line 72) | def get_global_func(name, allow_missing=False):
  function list_global_func_names (line 91) | def list_global_func_names():
  function _get_api (line 110) | def _get_api(f):
  function _init_api (line 116) | def _init_api(namespace, target_module_name=None):
  function _init_api_prefix (line 133) | def _init_api_prefix(module_name, prefix):

FILE: python/mxnet/_ffi/node_generic.py
  function _scalar_type_inference (line 26) | def _scalar_type_inference(value):
  function convert_to_node (line 43) | def convert_to_node(value):
  function const (line 79) | def const(value, dtype=None):

FILE: python/mxnet/_ffi/object.py
  function _new_object (line 42) | def _new_object(cls):
  class Object (line 47) | class Object(_ObjectBase):
  function register_object (line 51) | def register_object(type_key=None):
  function getitem_helper (line 90) | def getitem_helper(obj, elem_getter, length, idx):

FILE: python/mxnet/_ffi/runtime_ctypes.py
  class TVMByteArray (line 24) | class TVMByteArray(ctypes.Structure):

FILE: python/mxnet/_global_var.py
  function _set_ndarray_class (line 23) | def _set_ndarray_class(cls):
  function _set_np_ndarray_class (line 28) | def _set_np_ndarray_class(cls):

FILE: python/mxnet/_numpy_op_doc.py
  function _np_sometrue (line 23) | def _np_sometrue(a, axis=None, keepdims=False, out=None):
  function _npx_nonzero (line 36) | def _npx_nonzero(a):
  function _np_repeat (line 83) | def _np_repeat(a, repeats, axis=None):
  function _np_dot (line 130) | def _np_dot(a, b, out=None):
  function _np_copy (line 192) | def _np_copy(a, out=None):
  function _np_reshape (line 239) | def _np_reshape(a, newshape, order='C', out=None):
  function _np_squeeze (line 300) | def _np_squeeze(a, axis=None, out=None):
  function _np_prod (line 351) | def _np_prod(a, axis=None, dtype=None, out=None, keepdims=False):
  function _np_product (line 434) | def _np_product(a, axis=None, dtype=None, out=None, keepdims=False):
  function _np_moveaxis (line 445) | def _np_moveaxis(a, source, destination):
  function _np__random_shuffle (line 488) | def _np__random_shuffle(x):
  function _npx_constraint_check (line 524) | def _npx_constraint_check(x, msg):
  function _npx_reshape (line 563) | def _npx_reshape(a, newshape, reverse=False, order='C'):
  function _npx_index_add (line 629) | def _npx_index_add(a, ind, val):
  function _npx_index_update (line 705) | def _npx_index_update(a, ind, val):
  function _np_diag (line 774) | def _np_diag(array, k=0):
  function _np_diagonal (line 809) | def _np_diagonal(a, offset=0, axis1=0, axis2=1):
  function _np_diagflat (line 861) | def _np_diagflat(array, k=0):

FILE: python/mxnet/amp/amp.py
  function _cast_symbol_NDArray (line 57) | def _cast_symbol_NDArray(s, dtype, is_numpy_module=False):
  function _get_nd_fun_to_wrap (line 68) | def _get_nd_fun_to_wrap(name, module, submodule_dict):
  function _get_np_fun_to_wrap (line 86) | def _get_np_fun_to_wrap(name, ns_prefix):
  function _wrap_module_functions (line 106) | def _wrap_module_functions(module, is_numpy_module, target_dtype, get_al...
  function _wrap_loss_output_functions (line 255) | def _wrap_loss_output_functions(module, ls, target_dtype):
  function scale_loss (line 291) | def scale_loss(loss, optimizer_or_trainer):
  function warn_if_model_exists (line 301) | def warn_if_model_exists():
  function init (line 309) | def init(target_dtype='float16', target_precision_ops=None,
  function init_trainer (line 379) | def init_trainer(optimizer_or_trainer):
  function unscale (line 407) | def unscale(optimizer_or_trainer):
  function convert_symbol (line 431) | def convert_symbol(sym, input_dtypes, param_dtypes, target_dtype, target...
  function convert_model (line 574) | def convert_model(sym, arg_params, aux_params, input_dtypes, target_dtype,
  function convert_hybrid_block (line 646) | def convert_hybrid_block(block, data_example, target_dtype, target_dtype...
  function list_lp16_ops (line 722) | def list_lp16_ops(target_dtype):
  function list_fp32_ops (line 731) | def list_fp32_ops(target_dtype):
  function list_lp16_fp32_ops (line 740) | def list_lp16_fp32_ops(target_dtype):
  function list_conditional_fp32_ops (line 749) | def list_conditional_fp32_ops(target_dtype):
  function list_widest_type_cast (line 758) | def list_widest_type_cast(target_dtype):
  function list_loss_output_functions (line 767) | def list_loss_output_functions(target_dtype):
  function list_lp16_use_fp32_params (line 776) | def list_lp16_use_fp32_params(target_dtype):

FILE: python/mxnet/amp/loss_scaler.py
  class LossScaler (line 26) | class LossScaler(object):
    method __init__ (line 34) | def __init__(self):
    method loss_scale (line 42) | def loss_scale(self):
    method has_overflow (line 45) | def has_overflow(self, params):

FILE: python/mxnet/attribute.py
  class AttrScope (line 23) | class AttrScope:
    method __init__ (line 35) | def __init__(self, **kwargs):
    method get (line 42) | def get(self, attr):
    method __enter__ (line 64) | def __enter__(self):  # pylint: disable=protected-access
    method __exit__ (line 73) | def __exit__(self, ptype, value, trace):
  function current (line 81) | def current():

FILE: python/mxnet/autograd.py
  function set_recording (line 34) | def set_recording(is_recording): #pylint: disable=redefined-outer-name
  function set_training (line 51) | def set_training(train_mode): #pylint: disable=redefined-outer-name
  function is_recording (line 69) | def is_recording():
  function is_training (line 80) | def is_training():
  class _RecordingStateScope (line 92) | class _RecordingStateScope(object):
    method __init__ (line 102) | def __init__(self, is_record, train_mode): #pylint: disable=redefined-...
    method __enter__ (line 108) | def __enter__(self):
    method __exit__ (line 114) | def __exit__(self, ptype, value, trace):
  function record (line 121) | def record(train_mode=True): #pylint: disable=redefined-outer-name
  function pause (line 145) | def pause(train_mode=False): #pylint: disable=redefined-outer-name
  function train_mode (line 165) | def train_mode():
  function predict_mode (line 180) | def predict_mode():
  function mark_variables (line 196) | def mark_variables(variables, gradients, grad_reqs='write'):
  function _parse_head (line 225) | def _parse_head(heads, head_grads):
  function backward (line 245) | def backward(heads, head_grads=None, retain_graph=False, train_mode=True...
  function grad (line 272) | def grad(heads, variables, head_grads=None, retain_graph=None, create_gr...
  function get_symbol (line 349) | def get_symbol(x):
  class Function (line 369) | class Function(object):
    class _Registry (line 410) | class _Registry(object):
      method __init__ (line 412) | def __init__(self):
      method inc (line 417) | def inc(self):
    method __init__ (line 427) | def __init__(self):
    method save_for_backward (line 431) | def save_for_backward(self, *args):
    method __call__ (line 434) | def __call__(self, *inputs):
    method forward (line 515) | def forward(self, *inputs):
    method backward (line 519) | def backward(self, *output_grads):

FILE: python/mxnet/base.py
  function data_dir_default (line 64) | def data_dir_default():
  function data_dir (line 76) | def data_dir():
  class _NullType (line 84) | class _NullType(object):
    method __repr__ (line 86) | def __repr__(self):
  class MXNetError (line 93) | class MXNetError(RuntimeError):
  function register_error (line 99) | def register_error(func_name=None, cls=None):
  function _valid_error_name (line 140) | def _valid_error_name(name):
  function _find_error_type (line 145) | def _find_error_type(line):
  function c2pyerror (line 166) | def c2pyerror(err_msg):
  class NotImplementedForSymbol (line 208) | class NotImplementedForSymbol(MXNetError):
    method __init__ (line 210) | def __init__(self, function, alias, *args):
    method __str__ (line 216) | def __str__(self):
  function get_last_ffi_error (line 226) | def get_last_ffi_error():
  function check_call (line 241) | def check_call(ret):
  class NotSupportedForSparseNDArray (line 256) | class NotSupportedForSparseNDArray(MXNetError):
    method __init__ (line 258) | def __init__(self, function, alias, *args):
    method __str__ (line 264) | def __str__(self):
  class MXCallbackList (line 274) | class MXCallbackList(ctypes.Structure):
  function _load_lib (line 284) | def _load_lib():
  function c_str (line 350) | def c_str(string):
  function c_str_array (line 371) | def c_str_array(strings):
  function c_array (line 389) | def c_array(ctype, values):
  function c_array_buf (line 418) | def c_array_buf(ctype, buf):
  function c_handle_array (line 447) | def c_handle_array(objs):
  function ctypes2buffer (line 465) | def ctypes2buffer(cptr, length):
  function ctypes2numpy_shared (line 489) | def ctypes2numpy_shared(cptr, shape):
  function build_param_doc (line 516) | def build_param_doc(arg_names, arg_types, arg_descs, remove_dup=True):
  function _notify_shutdown (line 555) | def _notify_shutdown():
  function add_fileline_to_docstring (line 563) | def add_fileline_to_docstring(module, incursive=True):
  function _as_list (line 599) | def _as_list(obj):
  function _get_op_name_prefix (line 621) | def _get_op_name_prefix(op_name):
  function _init_op_module (line 633) | def _init_op_module(root_namespace, module_name, make_op_func):
  function _generate_op_module_signature (line 706) | def _generate_op_module_signature(root_namespace, module_name, op_code_g...
  function _is_np_op (line 838) | def _is_np_op(op_name):
  function _output_is_list (line 843) | def _output_is_list(op_name):
  function _get_op_submodule_name (line 859) | def _get_op_submodule_name(op_name, op_name_prefix, submodule_name_list):
  function _init_np_op_module (line 868) | def _init_np_op_module(root_module_name, np_module_name, mx_module_name,...

FILE: python/mxnet/callback.py
  function do_checkpoint (line 26) | def do_checkpoint(prefix, period=1):
  function log_train_metric (line 64) | def log_train_metric(period, auto_reset=False):
  class Speedometer (line 91) | class Speedometer(object):
    method __init__ (line 113) | def __init__(self, batch_size, frequent=50, auto_reset=True):
    method __call__ (line 121) | def __call__(self, param):
  class ProgressBar (line 155) | class ProgressBar(object):
    method __init__ (line 172) | def __init__(self, total, length=80):
    method __call__ (line 176) | def __call__(self, param):
  class LogValidationMetricsCallback (line 185) | class LogValidationMetricsCallback(object):
    method __call__ (line 188) | def __call__(self, param):

FILE: python/mxnet/container.py
  class ADT (line 26) | class ADT(Object):
    method __init__ (line 37) | def __init__(self, tag, fields):
    method tag (line 44) | def tag(self):
    method __getitem__ (line 47) | def __getitem__(self, idx):
    method __len__ (line 51) | def __len__(self):
  class Map (line 55) | class Map(Object):
    method __getitem__ (line 63) | def __getitem__(self, k):
    method __contains__ (line 66) | def __contains__(self, k):
    method items (line 69) | def items(self):
    method __len__ (line 74) | def __len__(self):
    method get (line 77) | def get(self, key, default=None):
  class String (line 96) | class String(str, PyNativeObject):
    method __new__ (line 107) | def __new__(cls, content):
    method __from_mxnet_object__ (line 114) | def __from_mxnet_object__(cls, obj):

FILE: python/mxnet/context.py
  function Context (line 23) | def Context(*args, **kwargs):
  function current_context (line 29) | def current_context():

FILE: python/mxnet/contrib/io.py
  class DataLoaderIter (line 24) | class DataLoaderIter(DataIter):
    method __init__ (line 52) | def __init__(self, loader, data_name='data', label_name='softmax_label...
    method reset (line 64) | def reset(self):
    method iter_next (line 67) | def iter_next(self):
    method getdata (line 74) | def getdata(self):
    method getlabel (line 82) | def getlabel(self):
    method getpad (line 90) | def getpad(self):
    method getindex (line 93) | def getindex(self):

FILE: python/mxnet/contrib/onnx/__init__.py
  function export_model (line 21) | def export_model(*args, **kwargs):

FILE: python/mxnet/contrib/quantization.py
  function _multilist_iterator (line 36) | def _multilist_iterator(arg, func):
  function _quantize_params (line 50) | def _quantize_params(qsym, params, min_max_dict):
  function _quantize_symbol (line 105) | def _quantize_symbol(sym, device, excluded_symbols=None, excluded_operat...
  class CalibrationCollector (line 179) | class CalibrationCollector(object):
    method __init__ (line 183) | def __init__(self):
    method collect (line 188) | def collect(self, name, op_name, arr):
    method post_collect (line 203) | def post_collect(self):
  class _LayerHistogramCollector (line 210) | class _LayerHistogramCollector(CalibrationCollector):
    method __init__ (line 215) | def __init__(self, quantized_dtype, num_bins=8001, include_layers=None...
    method collect (line 223) | def collect(self, name, op_name, arr):
    method post_collect (line 239) | def post_collect(self):
    method combine_histogram (line 244) | def combine_histogram(old_hist, arr, new_min, new_max, new_th):
    method get_optimal_threshold (line 263) | def get_optimal_threshold(hist_data, quantized_dtype, num_quantized_bi...
    method get_optimal_thresholds (line 287) | def get_optimal_thresholds(hist_dict, quantized_dtype, num_quantized_b...
  class _LayerOutputMinMaxCollector (line 311) | class _LayerOutputMinMaxCollector(CalibrationCollector):
    method __init__ (line 315) | def __init__(self, quantized_dtype, include_layers=None, logger=None):
    method collect (line 322) | def collect(self, name, op_name, arr):
  function _calibrate_quantized_sym (line 339) | def _calibrate_quantized_sym(qsym, min_max_dict):
  function _collect_layer_statistics (line 364) | def _collect_layer_statistics(sym_block, data, collector, num_inputs, nu...
  function _generate_list_of_data_desc (line 382) | def _generate_list_of_data_desc(data_shapes, data_types):
  function quantize_model (line 423) | def quantize_model(sym, arg_params, aux_params, data_names=('data',),
  function quantize_model_onednn (line 571) | def quantize_model_onednn(sym, arg_params, aux_params, data_names=('data...
  function quantize_graph (line 613) | def quantize_graph(sym, arg_params, aux_params, device=cpu(),
  function calib_graph (line 740) | def calib_graph(qsym, arg_params, aux_params, collector,
  function quantize_net (line 799) | def quantize_net(network, quantized_dtype='auto', quantize_mode='full', ...

FILE: python/mxnet/contrib/tensorboard.py
  class LogMetricsCallback (line 24) | class LogMetricsCallback(object):
    method __init__ (line 56) | def __init__(self, logging_dir, prefix=None):
    method __call__ (line 64) | def __call__(self, param):

FILE: python/mxnet/contrib/tensorrt.py
  function set_use_fp16 (line 21) | def set_use_fp16(status):
  function get_use_fp16 (line 30) | def get_use_fp16():
  function init_tensorrt_params (line 37) | def init_tensorrt_params(sym, arg_params, aux_params):

FILE: python/mxnet/contrib/text/embedding.py
  function register (line 40) | def register(embedding_cls):
  function create (line 63) | def create(embedding_name, **kwargs):
  function get_pretrained_file_names (line 90) | def get_pretrained_file_names(embedding_name=None):
  class _TokenEmbedding (line 133) | class _TokenEmbedding(vocab.Vocabulary):
    method __init__ (line 183) | def __init__(self, **kwargs):
    method _get_download_file_name (line 187) | def _get_download_file_name(cls, pretrained_file_name):
    method _get_pretrained_file_url (line 191) | def _get_pretrained_file_url(cls, pretrained_file_name):
    method _get_pretrained_file (line 200) | def _get_pretrained_file(cls, embedding_root, pretrained_file_name):
    method _load_embedding (line 232) | def _load_embedding(self, pretrained_file_path, elem_delim, init_unkno...
    method _index_tokens_from_vocabulary (line 306) | def _index_tokens_from_vocabulary(self, vocabulary):
    method _set_idx_to_vec_by_embeddings (line 315) | def _set_idx_to_vec_by_embeddings(self, token_embeddings, vocab_len, v...
    method _build_embedding_for_vocabulary (line 347) | def _build_embedding_for_vocabulary(self, vocabulary):
    method vec_len (line 361) | def vec_len(self):
    method idx_to_vec (line 365) | def idx_to_vec(self):
    method get_vecs_by_tokens (line 368) | def get_vecs_by_tokens(self, tokens, lower_case_backup=False):
    method update_token_vectors (line 413) | def update_token_vectors(self, tokens, new_vectors):
    method _check_pretrained_file_names (line 460) | def _check_pretrained_file_names(cls, pretrained_file_name):
  class GloVe (line 477) | class GloVe(_TokenEmbedding):
    method _get_download_file_name (line 527) | def _get_download_file_name(cls, pretrained_file_name):
    method __init__ (line 534) | def __init__(self, pretrained_file_name='glove.840B.300d.txt',
  class FastText (line 549) | class FastText(_TokenEmbedding):
    method _get_download_file_name (line 613) | def _get_download_file_name(cls, pretrained_file_name):
    method __init__ (line 617) | def __init__(self, pretrained_file_name='wiki.simple.vec',
  class CustomEmbedding (line 631) | class CustomEmbedding(_TokenEmbedding):
    method __init__ (line 664) | def __init__(self, pretrained_file_path, elem_delim=' ', encoding='utf8',
  class CompositeEmbedding (line 673) | class CompositeEmbedding(_TokenEmbedding):
    method __init__ (line 692) | def __init__(self, vocabulary, token_embeddings):

FILE: python/mxnet/contrib/text/utils.py
  function count_tokens_from_str (line 26) | def count_tokens_from_str(source_str, token_delim=' ', seq_delim='\n',

FILE: python/mxnet/contrib/text/vocab.py
  class Vocabulary (line 28) | class Vocabulary(object):
    method __init__ (line 73) | def __init__(self, counter=None, most_freq_count=None, min_freq=1, unk...
    method _index_unknown_and_reserved_tokens (line 92) | def _index_unknown_and_reserved_tokens(self, unknown_token, reserved_t...
    method _index_counter_keys (line 107) | def _index_counter_keys(self, counter, unknown_token, reserved_tokens,...
    method __len__ (line 135) | def __len__(self):
    method token_to_idx (line 139) | def token_to_idx(self):
    method idx_to_token (line 146) | def idx_to_token(self):
    method unknown_token (line 153) | def unknown_token(self):
    method reserved_tokens (line 157) | def reserved_tokens(self):
    method to_indices (line 160) | def to_indices(self, tokens):
    method to_tokens (line 186) | def to_tokens(self, indices):

FILE: python/mxnet/cuda/__init__.py
  function cuda_profiler_start (line 22) | def cuda_profiler_start():
  function cuda_profiler_stop (line 26) | def cuda_profiler_stop():

FILE: python/mxnet/cuda/nvtx.py
  function range_push (line 34) | def range_push(name, color=ORANGE):
  function range_pop (line 40) | def range_pop():
  class range (line 44) | class range:
    method __init__ (line 45) | def __init__(self, name, color=ORANGE):
    method __enter__ (line 49) | def __enter__(self):
    method __exit__ (line 52) | def __exit__(self, exc_type, exc_val, exc_tb):

FILE: python/mxnet/device.py
  class Device (line 24) | class Device:
    method __init__ (line 67) | def __init__(self, device_type, device_id=0):
    method device_type (line 77) | def device_type(self):
    method __hash__ (line 93) | def __hash__(self):
    method __eq__ (line 97) | def __eq__(self, other):
    method __str__ (line 105) | def __str__(self):
    method __repr__ (line 108) | def __repr__(self):
    method __enter__ (line 111) | def __enter__(self):
    method __exit__ (line 117) | def __exit__(self, ptype, value, trace):
    method empty_cache (line 120) | def empty_cache(self):
  function cpu (line 139) | def cpu(device_id=0):
  function cpu_pinned (line 169) | def cpu_pinned(device_id=0):
  function gpu (line 199) | def gpu(device_id=0):
  function num_gpus (line 231) | def num_gpus():
  function gpu_memory_info (line 249) | def gpu_memory_info(device_id=0):
  function current_device (line 275) | def current_device():

FILE: python/mxnet/dlpack.py
  function _dlpack_deleter (line 36) | def _dlpack_deleter(pycapsule):
  class DLDeviceType (line 45) | class DLDeviceType(enum.IntEnum):
  class DLContext (line 57) | class DLContext(ctypes.Structure):
  class DLDataType (line 61) | class DLDataType(ctypes.Structure):
  class DLTensor (line 78) | class DLTensor(ctypes.Structure):
  class DLManagedTensor (line 87) | class DLManagedTensor(ctypes.Structure):
  function dl_managed_tensor_deleter (line 99) | def dl_managed_tensor_deleter(dl_managed_tensor_handle):
  function ndarray_from_dlpack (line 104) | def ndarray_from_dlpack(array_cls):
  function ndarray_to_dlpack_for_read (line 139) | def ndarray_to_dlpack_for_read():
  function ndarray_to_dlpack_for_write (line 153) | def ndarray_to_dlpack_for_write():
  function ndarray_from_numpy (line 168) | def ndarray_from_numpy(array_cls, array_create_fn):

FILE: python/mxnet/engine.py
  function set_bulk_size (line 25) | def set_bulk_size(size):
  class _BulkScope (line 48) | class _BulkScope(object):
    method __init__ (line 50) | def __init__(self, size):
    method __enter__ (line 54) | def __enter__(self):
    method __exit__ (line 58) | def __exit__(self, ptype, value, trace):
  function bulk (line 62) | def bulk(size):

FILE: python/mxnet/error.py
  class InternalError (line 31) | class InternalError(MXNetError):
    method __init__ (line 46) | def __init__(self, msg):

FILE: python/mxnet/executor.py
  class Executor (line 25) | class Executor:
    method __init__ (line 37) | def __init__(self, sym, device, args, args_grad, grad_req, aux_states,...
    method get_optimized_symbol (line 127) | def get_optimized_symbol(self):
    method forward (line 138) | def forward(self, is_train=False, **kwargs):
    method backward (line 190) | def backward(self, out_grads=None):
    method aux_arrays (line 233) | def aux_arrays(self):
    method arg_arrays (line 243) | def arg_arrays(self):
    method grad_arrays (line 253) | def grad_arrays(self):
    method arg_dict (line 273) | def arg_dict(self):
    method aux_dict (line 292) | def aux_dict(self):
    method grad_dict (line 311) | def grad_dict(self):
    method output_dict (line 326) | def output_dict(self):
    method copy_params_from (line 343) | def copy_params_from(self, arg_params, aux_params=None, allow_extra_pa...

FILE: python/mxnet/gluon/block.py
  function _block_scope (line 55) | def _block_scope(block):
  function _gather_type_device_info (line 72) | def _gather_type_device_info(args):
  function _flatten (line 119) | def _flatten(args, inout_str):
  function _regroup (line 160) | def _regroup(args, fmt):
  class Block (line 204) | class Block:
    method __init__ (line 234) | def __init__(self):
    method __repr__ (line 240) | def __repr__(self):
    method __setattr__ (line 247) | def __setattr__(self, name, value):
    method _check_container_with_block (line 264) | def _check_container_with_block(self):
    method _alias (line 292) | def _alias(self):
    method params (line 296) | def params(self):
    method collect_params (line 301) | def collect_params(self, select=None):
    method _collect_params_with_prefix (line 329) | def _collect_params_with_prefix(self, prefix='', select=None):
    method save_parameters (line 342) | def save_parameters(self, filename, deduplicate=False):
    method load_parameters (line 381) | def load_parameters(self, filename, device=None, allow_missing=False,
    method load_dict (line 436) | def load_dict(self, param_dict, device=None, allow_missing=False,
    method register_child (line 494) | def register_child(self, block, name=None):
    method register_forward_pre_hook (line 501) | def register_forward_pre_hook(self, hook):
    method register_forward_hook (line 520) | def register_forward_hook(self, hook):
    method apply (line 539) | def apply(self, fn):
    method initialize (line 557) | def initialize(self, init=initializer.Uniform(), device=None, verbose=...
    method save (line 579) | def save(self, prefix):
    method load (line 650) | def load(self, prefix):
    method hybridize (line 716) | def hybridize(self, active=True, **kwargs):
    method cast (line 722) | def cast(self, dtype):
    method zero_grad (line 735) | def zero_grad(self):
    method reset_device (line 758) | def reset_device(self, device):
    method reset_ctx (line 771) | def reset_ctx(self, ctx):
    method setattr (line 777) | def setattr(self, name, value):
    method share_parameters (line 800) | def share_parameters(self, shared):
    method _shared_parameters (line 841) | def _shared_parameters(self, shared, shared_set, prefix=""):
    method __call__ (line 852) | def __call__(self, *args):
    method forward (line 865) | def forward(self, *args):
    method register_op_hook (line 877) | def register_op_hook(self, callback, monitor_all=False):
    method summary (line 894) | def summary(self, *inputs):
  class HybridBlock (line 1006) | class HybridBlock(Block):
    class OptConstraint (line 1048) | class OptConstraint:
      class Flag (line 1049) | class Flag(enum.Flag):
      method __init__ (line 1052) | def __init__(self, flag) -> None:
      method __enter__ (line 1056) | def __enter__(self):
      method __exit__ (line 1061) | def __exit__(self, ptype, value, trace):
      method disable_all (line 1065) | def disable_all():
      method disable_amp (line 1071) | def disable_amp():
    method __init__ (line 1074) | def __init__(self):
    method __setattr__ (line 1093) | def __setattr__(self, name, value):
    method generate_arg_names (line 1104) | def generate_arg_names(arg_num):
    method _get_graph (line 1107) | def _get_graph(self, *args):
    method _build_cache (line 1131) | def _build_cache(self, *args, update_graph=True):
    method _deferred_infer_shape (line 1249) | def _deferred_infer_shape(self, *args):
    method _call_cached_op (line 1257) | def _call_cached_op(self, *args):
    method optimize_for (line 1308) | def optimize_for(self, x, *args, backend=None, clear=False,
    method _clear_cached_op (line 1396) | def _clear_cached_op(self):
    method register_child (line 1401) | def register_child(self, block, name=None):
    method hybridize (line 1414) | def hybridize(self, active=True,
    method cast (line 1464) | def cast(self, dtype):
    method _infer_attrs (line 1472) | def _infer_attrs(self, infer_fn, attr, *args):
    method infer_shape (line 1488) | def infer_shape(self, *args):
    method infer_type (line 1502) | def infer_type(self, *args):
    method export (line 1506) | def export(self, path, epoch=0, remove_amp_cast=True):
    method register_op_hook (line 1583) | def register_op_hook(self, callback, monitor_all=False):
    method __call__ (line 1611) | def __call__(self, x, *args):
    method forward (line 1648) | def forward(self, x, *args):
    method reset_device (line 1654) | def reset_device(self, device):
    method reset_ctx (line 1672) | def reset_ctx(self, ctx):
  class SymbolBlock (line 1679) | class SymbolBlock(HybridBlock):
    method imports (line 1712) | def imports(symbol_file, input_names, param_file=None, device=None, al...
    method __repr__ (line 1767) | def __repr__(self):
    method __init__ (line 1776) | def __init__(self, outputs, inputs, params=None):
    method infer_shape (line 1840) | def infer_shape(self, *args):
    method __call__ (line 1844) | def __call__(self, x, *args):
    method forward (line 1856) | def forward(self, x, *args):
    method _clear_cached_op (line 1874) | def _clear_cached_op(self):
    method cast (line 1879) | def cast(self, dtype):
  function _infer_param_types (line 1905) | def _infer_param_types(in_params, out_params, arg_params, aux_params, de...
  function set_optimization_constraints (line 1972) | def set_optimization_constraints(state):
  function get_optimization_constraints (line 1978) | def get_optimization_constraints():

FILE: python/mxnet/gluon/contrib/data/vision/dataloader.py
  function create_image_augment (line 34) | def create_image_augment(data_shape, resize=0, rand_crop=False, rand_res...
  class ImageDataLoader (line 140) | class ImageDataLoader(object):
    method __init__ (line 186) | def __init__(self, batch_size, data_shape, path_imgrec=None, path_imgl...
    method __iter__ (line 240) | def __iter__(self):
    method __len__ (line 243) | def __len__(self):
  function create_bbox_augment (line 246) | def create_bbox_augment(data_shape, rand_crop=0, rand_pad=0, rand_gray=0,
  class ImageBboxDataLoader (line 364) | class ImageBboxDataLoader(object):
    method __init__ (line 405) | def __init__(self, batch_size, data_shape, path_imgrec=None, path_imgl...
    method __iter__ (line 468) | def __iter__(self):
    method __len__ (line 471) | def __len__(self):
  class BboxLabelTransform (line 474) | class BboxLabelTransform(Block):
    method __init__ (line 483) | def __init__(self, coord_normalized=True):
    method forward (line 487) | def forward(self, img, label):

FILE: python/mxnet/gluon/contrib/data/vision/transforms/bbox/bbox.py
  class ImageBboxRandomFlipLeftRight (line 34) | class ImageBboxRandomFlipLeftRight(Block):
    method __init__ (line 55) | def __init__(self, p=0.5):
    method forward (line 59) | def forward(self, img, bbox):
    method _flip_image (line 75) | def _flip_image(self, img):
    method _flip_bbox (line 81) | def _flip_bbox(self, img, bbox):
  class ImageBboxCrop (line 90) | class ImageBboxCrop(Block):
    method __init__ (line 115) | def __init__(self, crop, allow_outside_center=False):
    method forward (line 130) | def forward(self, img, bbox):
  class ImageBboxRandomCropWithConstraints (line 146) | class ImageBboxRandomCropWithConstraints(Block):
    method __init__ (line 187) | def __init__(self, p=0.5, min_scale=0.3, max_scale=1,
    method forward (line 200) | def forward(self, img, bbox):
  class ImageBboxRandomExpand (line 216) | class ImageBboxRandomExpand(Block):
    method __init__ (line 248) | def __init__(self, p=0.5, max_ratio=4, fill=0, keep_ratio=True):
    method forward (line 255) | def forward(self, img, bbox):
  class ImageBboxResize (line 297) | class ImageBboxResize(Block):
    method __init__ (line 321) | def __init__(self, width, height, interp=1):
    method forward (line 326) | def forward(self, img, bbox):

FILE: python/mxnet/gluon/contrib/data/vision/transforms/bbox/utils.py
  function _check_bbox_shape (line 26) | def _check_bbox_shape(bbox):
  function bbox_crop (line 30) | def bbox_crop(bbox, crop_box=None, allow_outside_center=True):
  function bbox_flip (line 85) | def bbox_flip(bbox, size, flip_x=False, flip_y=False):
  function bbox_resize (line 124) | def bbox_resize(bbox, in_size, out_size):
  function bbox_translate (line 159) | def bbox_translate(bbox, x_offset=0, y_offset=0):
  function bbox_iou (line 185) | def bbox_iou(bbox_a, bbox_b, offset=0):
  function bbox_xywh_to_xyxy (line 218) | def bbox_xywh_to_xyxy(xywh):
  function bbox_xyxy_to_xywh (line 252) | def bbox_xyxy_to_xywh(xyxy):
  function bbox_clip_xyxy (line 286) | def bbox_clip_xyxy(xyxy, width, height):
  function bbox_random_crop_with_constraints (line 330) | def bbox_random_crop_with_constraints(bbox, size, min_scale=0.3, max_sca...

FILE: python/mxnet/gluon/contrib/estimator/batch_processor.py
  class BatchProcessor (line 28) | class BatchProcessor(object):
    method __init__ (line 40) | def __init__(self):
    method _get_data_and_label (line 43) | def _get_data_and_label(self, batch, ctx, batch_axis=0):
    method evaluate_batch (line 50) | def evaluate_batch(self, estimator,
    method fit_batch (line 70) | def fit_batch(self, estimator,

FILE: python/mxnet/gluon/contrib/estimator/estimator.py
  class Estimator (line 42) | class Estimator(object):
    method __init__ (line 110) | def __init__(self, net,
    method _check_loss (line 141) | def _check_loss(self, loss):
    method _check_context (line 147) | def _check_context(self, context):
    method _check_devices (line 153) | def _check_devices(self, devices):
    method _check_batch_processor (line 186) | def _check_batch_processor(self, batch_processor):
    method _initialize (line 198) | def _initialize(self, initializer):
    method _check_trainer (line 218) | def _check_trainer(self, trainer):
    method _is_initialized (line 230) | def _is_initialized(self):
    method _get_data_and_label (line 239) | def _get_data_and_label(self, batch, device, batch_axis=0):
    method _add_default_training_metrics (line 246) | def _add_default_training_metrics(self):
    method _add_validation_metrics (line 259) | def _add_validation_metrics(self):
    method train_metrics (line 272) | def train_metrics(self):
    method val_metrics (line 276) | def val_metrics(self):
    method evaluate (line 279) | def evaluate(self,
    method fit (line 333) | def fit(self, train_data,
    method _prepare_default_handlers (line 430) | def _prepare_default_handlers(self, val_data, event_handlers):
    method _prepare_default_validation_handlers (line 468) | def _prepare_default_validation_handlers(self, event_handlers):
    method _categorize_handlers (line 491) | def _categorize_handlers(self, event_handlers):

FILE: python/mxnet/gluon/contrib/estimator/event_handler.py
  class EventHandler (line 37) | class EventHandler(object):
  function _check_event_handlers (line 41) | def _check_event_handlers(handlers):
  class TrainBegin (line 52) | class TrainBegin(EventHandler):
    method train_begin (line 53) | def train_begin(self, estimator, *args, **kwargs):
  class TrainEnd (line 57) | class TrainEnd(EventHandler):
    method train_end (line 58) | def train_end(self, estimator, *args, **kwargs):
  class EpochBegin (line 62) | class EpochBegin(EventHandler):
    method epoch_begin (line 63) | def epoch_begin(self, estimator, *args, **kwargs):
  class EpochEnd (line 67) | class EpochEnd(EventHandler):
    method epoch_end (line 68) | def epoch_end(self, estimator, *args, **kwargs):
  class BatchBegin (line 72) | class BatchBegin(EventHandler):
    method batch_begin (line 73) | def batch_begin(self, estimator, *args, **kwargs):
  class BatchEnd (line 77) | class BatchEnd(EventHandler):
    method batch_end (line 78) | def batch_end(self, estimator, *args, **kwargs):
  class StoppingHandler (line 82) | class StoppingHandler(TrainBegin, BatchEnd, EpochEnd):
    method __init__ (line 96) | def __init__(self, max_epoch=None, max_batch=None):
    method train_begin (line 103) | def train_begin(self, estimator, *args, **kwargs):
    method batch_end (line 109) | def batch_end(self, estimator, *args, **kwargs):
    method epoch_end (line 115) | def epoch_end(self, estimator, *args, **kwargs):
  class MetricHandler (line 122) | class MetricHandler(EpochBegin, BatchEnd):
    method __init__ (line 138) | def __init__(self, metrics, priority=-1000):
    method epoch_begin (line 144) | def epoch_begin(self, estimator, *args, **kwargs):
    method batch_end (line 148) | def batch_end(self, estimator, *args, **kwargs):
  class ValidationHandler (line 160) | class ValidationHandler(TrainBegin, BatchEnd, EpochEnd):
    method __init__ (line 190) | def __init__(self,
    method train_begin (line 208) | def train_begin(self, estimator, *args, **kwargs):
    method batch_end (line 213) | def batch_end(self, estimator, *args, **kwargs):
    method epoch_end (line 219) | def epoch_end(self, estimator, *args, **kwargs):
  class LoggingHandler (line 226) | class LoggingHandler(TrainBegin, TrainEnd, EpochBegin, EpochEnd, BatchBe...
    method __init__ (line 246) | def __init__(self, log_interval='epoch',
    method train_begin (line 262) | def train_begin(self, estimator, *args, **kwargs):
    method train_end (line 280) | def train_end(self, estimator, *args, **kwargs):
    method batch_begin (line 289) | def batch_begin(self, estimator, *args, **kwargs):
    method batch_end (line 293) | def batch_end(self, estimator, *args, **kwargs):
    method epoch_begin (line 310) | def epoch_begin(self, estimator, *args, **kwargs):
    method epoch_end (line 324) | def epoch_end(self, estimator, *args, **kwargs):
  class CheckpointHandler (line 336) | class CheckpointHandler(TrainBegin, BatchEnd, EpochEnd):
    method __init__ (line 377) | def __init__(self,
    method train_begin (line 435) | def train_begin(self, estimator, *args, **kwargs):
    method batch_end (line 455) | def batch_end(self, estimator, *args, **kwargs):
    method epoch_end (line 463) | def epoch_end(self, estimator, *args, **kwargs):
    method _save_checkpoint (line 468) | def _save_checkpoint(self, estimator):
    method _save_symbol (line 513) | def _save_symbol(self, estimator):
    method _save_params_and_trainer (line 525) | def _save_params_and_trainer(self, estimator, file_prefix):
    method _resume_from_checkpoint (line 541) | def _resume_from_checkpoint(self, estimator):
    method _find_max_iteration (line 590) | def _find_max_iteration(self, dir, prefix, start, end, saved_checkpoin...
  class EarlyStoppingHandler (line 611) | class EarlyStoppingHandler(TrainBegin, EpochEnd, TrainEnd):
    method __init__ (line 630) | def __init__(self,
    method train_begin (line 685) | def train_begin(self, estimator, *args, **kwargs):
    method epoch_end (line 695) | def epoch_end(self, estimator, *args, **kwargs):
    method train_end (line 713) | def train_end(self, estimator, *args, **kwargs):
  class GradientUpdateHandler (line 719) | class GradientUpdateHandler(BatchEnd):
    method __init__ (line 731) | def __init__(self, priority=-2000):
    method batch_end (line 734) | def batch_end(self, estimator, *args, **kwargs):

FILE: python/mxnet/gluon/contrib/estimator/utils.py
  function _check_metrics (line 25) | def _check_metrics(metrics):
  function _check_handler_metric_ref (line 37) | def _check_handler_metric_ref(handler, known_metrics):
  function _check_metric_known (line 49) | def _check_metric_known(handler, metric, known_metrics):
  function _sugges
Copy disabled (too large) Download .json
Condensed preview — 2643 files, each showing path, character count, and a content snippet. Download the .json file for the full structured content (16,331K chars).
[
  {
    "path": ".asf.yaml",
    "chars": 296,
    "preview": "notifications:\n    commits:      commits@mxnet.apache.org\n    issues:       issues@mxnet.apache.org\n    pullrequests: co"
  },
  {
    "path": ".clang-format",
    "chars": 1542,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": ".clang-tidy",
    "chars": 3121,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": ".cmakelintrc",
    "chars": 1241,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": ".codecov.yml",
    "chars": 519,
    "preview": "# Codecov.io configuration file\n# See https://docs.codecov.io/docs/codecovyml-reference\ncodecov:\n  notify:\n    require_c"
  },
  {
    "path": ".git-blame-ignore-revs",
    "chars": 980,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": ".gitattributes",
    "chars": 55,
    "preview": ".gitattributes export-ignore\nR-package/* export-ignore\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/bug_report.md",
    "chars": 1004,
    "preview": "---\nname: Bug report\nabout: Create a report to help us improve\ntitle: ''\nlabels: 'Bug, needs triage'\nassignees: ''\n\n---\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/config.yml",
    "chars": 474,
    "preview": "blank_issues_enabled: false\ncontact_links:\n  - name: GitHub Discussions\n    url: https://github.com/apache/mxnet/discuss"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/feature_request.md",
    "chars": 431,
    "preview": "---\nname: Feature request\nabout: Suggest an idea for this project\ntitle: ''\nlabels: 'Feature request'\nassignees: ''\n\n---"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/flaky_test.md",
    "chars": 245,
    "preview": "---\nname: Flaky test\nabout: Report a flaky test\ntitle: ''\nlabels: 'Flaky'\nassignees: ''\n\n---\n## Description\n(The locatio"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/rfc.md",
    "chars": 559,
    "preview": "---\nname: Request for comment (RFC)\nabout: RFC process requests for review on the design of a new feature or bug fix tha"
  },
  {
    "path": ".github/PULL_REQUEST_TEMPLATE.md",
    "chars": 590,
    "preview": "## Description ##\n(Brief description on what this PR is about)\n\n## Checklist ##\n### Essentials ###\n- [ ] PR's title star"
  },
  {
    "path": ".github/workflows/greetings.yml",
    "chars": 1577,
    "preview": "name: Greetings\n\non: [pull_request, issues]\n\njobs:\n  greeting:\n    runs-on: ubuntu-latest\n    steps:\n    - uses: actions"
  },
  {
    "path": ".github/workflows/license_check.yml",
    "chars": 487,
    "preview": "name: license check\n\non: [push, pull_request]\n\ndefaults:\n  run:\n    shell: bash\n\njobs:\n  licensecheck:\n    runs-on: ubun"
  },
  {
    "path": ".github/workflows/link_check.yml",
    "chars": 2386,
    "preview": "name: link check\n\non: [push, pull_request]\n\ndefaults:\n  run:\n    shell: bash\n\njobs:\n  linkcheck:\n    runs-on: ubuntu-20."
  },
  {
    "path": ".github/workflows/os_x_mklbuild.yml",
    "chars": 2603,
    "preview": "name: mkl continuous build\n\non: [push, pull_request]\n\njobs:\n  macosx-x86_64:\n    runs-on: macos-10.15\n    steps:\n      -"
  },
  {
    "path": ".github/workflows/os_x_staticbuild.yml",
    "chars": 4868,
    "preview": "name: continuous build\n\non: [push, pull_request]\n\njobs:\n  macosx-x86_64:\n    runs-on: macos-latest\n    steps:\n      - na"
  },
  {
    "path": ".gitignore",
    "chars": 1952,
    "preview": "# Compiled Object files\n*.slo\n*.lo\n*.o\n*.obj\n\n# Precompiled Headers\n*.gch\n*.pch\n\n# Compiled Dynamic libraries\n*.so\n*.dyl"
  },
  {
    "path": ".gitmodules",
    "chars": 1011,
    "preview": "[submodule \"3rdparty/dmlc-core\"]\n\tpath = 3rdparty/dmlc-core\n\turl = https://github.com/dmlc/dmlc-core.git\n[submodule \"3rd"
  },
  {
    "path": ".licenserc.yaml",
    "chars": 3269,
    "preview": "header:\n  license:\n    spdx-id: Apache-2.0\n    copyright-owner: Apache Software Foundation\n\n  paths-ignore:\n    - 'licen"
  },
  {
    "path": ".mxnet_root",
    "chars": 853,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "3rdparty/ctc_include/LICENSE",
    "chars": 11405,
    "preview": "                                 Apache License\n                           Version 2.0, January 2004\n                   "
  },
  {
    "path": "3rdparty/ctc_include/contrib/moderngpu/LICENSE",
    "chars": 1721,
    "preview": "/******************************************************************************\n* Copyright (c) 2013, NVIDIA CORPORATION"
  },
  {
    "path": "3rdparty/ctc_include/contrib/moderngpu/include/device/ctaloadbalance.cuh",
    "chars": 5142,
    "preview": "/******************************************************************************\n * Copyright (c) 2013, NVIDIA CORPORATIO"
  },
  {
    "path": "3rdparty/ctc_include/contrib/moderngpu/include/device/ctamerge.cuh",
    "chars": 12005,
    "preview": "/******************************************************************************\n * Copyright (c) 2013, NVIDIA CORPORATIO"
  },
  {
    "path": "3rdparty/ctc_include/contrib/moderngpu/include/device/ctascan.cuh",
    "chars": 9360,
    "preview": "/******************************************************************************\n * Copyright (c) 2013, NVIDIA CORPORATIO"
  },
  {
    "path": "3rdparty/ctc_include/contrib/moderngpu/include/device/ctasearch.cuh",
    "chars": 7116,
    "preview": "/******************************************************************************\n * Copyright (c) 2013, NVIDIA CORPORATIO"
  },
  {
    "path": "3rdparty/ctc_include/contrib/moderngpu/include/device/ctasegreduce.cuh",
    "chars": 8341,
    "preview": "/******************************************************************************\n * Copyright (c) 2013, NVIDIA CORPORATIO"
  },
  {
    "path": "3rdparty/ctc_include/contrib/moderngpu/include/device/ctasegscan.cuh",
    "chars": 5109,
    "preview": "/******************************************************************************\n * Copyright (c) 2013, NVIDIA CORPORATIO"
  },
  {
    "path": "3rdparty/ctc_include/contrib/moderngpu/include/device/ctasegsort.cuh",
    "chars": 15901,
    "preview": "/******************************************************************************\n * Copyright (c) 2013, NVIDIA CORPORATIO"
  },
  {
    "path": "3rdparty/ctc_include/contrib/moderngpu/include/device/ctasortedsearch.cuh",
    "chars": 8199,
    "preview": "/******************************************************************************\n * Copyright (c) 2013, NVIDIA CORPORATIO"
  },
  {
    "path": "3rdparty/ctc_include/contrib/moderngpu/include/device/devicetypes.cuh",
    "chars": 11556,
    "preview": "/******************************************************************************\n * Copyright (c) 2013, NVIDIA CORPORATIO"
  },
  {
    "path": "3rdparty/ctc_include/contrib/moderngpu/include/device/deviceutil.cuh",
    "chars": 4951,
    "preview": "/******************************************************************************\n * Copyright (c) 2013, NVIDIA CORPORATIO"
  },
  {
    "path": "3rdparty/ctc_include/contrib/moderngpu/include/device/intrinsics.cuh",
    "chars": 11593,
    "preview": "/******************************************************************************\n * Copyright (c) 2013, NVIDIA CORPORATIO"
  },
  {
    "path": "3rdparty/ctc_include/contrib/moderngpu/include/device/loadstore.cuh",
    "chars": 20359,
    "preview": "/******************************************************************************\n * Copyright (c) 2013, NVIDIA CORPORATIO"
  },
  {
    "path": "3rdparty/ctc_include/contrib/moderngpu/include/device/serialsets.cuh",
    "chars": 7299,
    "preview": "/******************************************************************************\n * Copyright (c) 2013, NVIDIA CORPORATIO"
  },
  {
    "path": "3rdparty/ctc_include/contrib/moderngpu/include/device/sortnetwork.cuh",
    "chars": 6722,
    "preview": "/******************************************************************************\n * Copyright (c) 2013, NVIDIA CORPORATIO"
  },
  {
    "path": "3rdparty/ctc_include/contrib/moderngpu/include/mgpudevice.cuh",
    "chars": 11300,
    "preview": "/******************************************************************************\n * Copyright (c) 2013, NVIDIA CORPORATIO"
  },
  {
    "path": "3rdparty/ctc_include/contrib/moderngpu/include/mgpuenums.h",
    "chars": 2523,
    "preview": "/******************************************************************************\n * Copyright (c) 2013, NVIDIA CORPORATIO"
  },
  {
    "path": "3rdparty/ctc_include/contrib/moderngpu/include/util/static.h",
    "chars": 5805,
    "preview": "/******************************************************************************\n * Copyright (c) 2013, NVIDIA CORPORATIO"
  },
  {
    "path": "3rdparty/ctc_include/detail/cpu_ctc.h",
    "chars": 18756,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/ctc_include/detail/ctc_helper.h",
    "chars": 2511,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/ctc_include/detail/gpu_ctc.h",
    "chars": 18793,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/ctc_include/detail/gpu_ctc_kernels.h",
    "chars": 17667,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/ctc_include/detail/hostdevice.h",
    "chars": 921,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/miniz/miniz.c",
    "chars": 316287,
    "preview": "/**************************************************************************\n *\n * Copyright 2013-2014 RAD Game Tools and"
  },
  {
    "path": "3rdparty/miniz/miniz.h",
    "chars": 68972,
    "preview": "/**************************************************************************\n *\n * Copyright 2013-2014 RAD Game Tools and"
  },
  {
    "path": "3rdparty/mshadow/.gitignore",
    "chars": 181,
    "preview": "# Compiled Object files\n*.slo\n*.lo\n*.o\n\n# Compiled Dynamic libraries\n*.so\n*.dylib\n\n# Compiled Static libraries\n*.lai\n*.l"
  },
  {
    "path": "3rdparty/mshadow/.travis.yml",
    "chars": 777,
    "preview": "# disable sudo to use container based build\nsudo: false\n\n# Use Build Matrix to do lint and build seperately\nenv:\n  matri"
  },
  {
    "path": "3rdparty/mshadow/CHANGES.md",
    "chars": 1106,
    "preview": "<!--- Licensed to the Apache Software Foundation (ASF) under one -->\n<!--- or more contributor license agreements.  See "
  },
  {
    "path": "3rdparty/mshadow/CMakeLists.txt",
    "chars": 4098,
    "preview": "cmake_minimum_required(VERSION 3.13)\nproject(mshadow C CXX)\n\ninclude(CMakeDependentOption)\noption(USE_CUDA \"Build with C"
  },
  {
    "path": "3rdparty/mshadow/LICENSE",
    "chars": 527,
    "preview": "Licensed under the Apache License, Version 2.0 (the \"License\");\nyou may not use this file except in compliance with the "
  },
  {
    "path": "3rdparty/mshadow/README.md",
    "chars": 3176,
    "preview": "<!--- Licensed to the Apache Software Foundation (ASF) under one -->\n<!--- or more contributor license agreements.  See "
  },
  {
    "path": "3rdparty/mshadow/cmake/AutoDetectF16C.cmake",
    "chars": 1823,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "3rdparty/mshadow/doc/Doxyfile",
    "chars": 103766,
    "preview": "# Doxyfile 1.8.8\n\n# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements."
  },
  {
    "path": "3rdparty/mshadow/doc/README.md",
    "chars": 14490,
    "preview": "<!--- Licensed to the Apache Software Foundation (ASF) under one -->\n<!--- or more contributor license agreements.  See "
  },
  {
    "path": "3rdparty/mshadow/doc/mkdoc.sh",
    "chars": 833,
    "preview": "#!/bin/bash\n\n# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See"
  },
  {
    "path": "3rdparty/mshadow/guide/.gitignore",
    "chars": 22,
    "preview": "defop\nbasic\nconfig.mk\n"
  },
  {
    "path": "3rdparty/mshadow/guide/Makefile",
    "chars": 1661,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "3rdparty/mshadow/guide/README.md",
    "chars": 10128,
    "preview": "<!--- Licensed to the Apache Software Foundation (ASF) under one -->\n<!--- or more contributor license agreements.  See "
  },
  {
    "path": "3rdparty/mshadow/guide/basic.cpp",
    "chars": 5490,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/guide/basic_stream.cu",
    "chars": 2109,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/guide/defop.cpp",
    "chars": 2245,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/guide/exp-template/.gitignore",
    "chars": 5,
    "preview": "exp_*"
  },
  {
    "path": "3rdparty/mshadow/guide/exp-template/Makefile",
    "chars": 1148,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "3rdparty/mshadow/guide/exp-template/README.md",
    "chars": 13935,
    "preview": "<!--- Licensed to the Apache Software Foundation (ASF) under one -->\n<!--- or more contributor license agreements.  See "
  },
  {
    "path": "3rdparty/mshadow/guide/mshadow-ps/.gitignore",
    "chars": 20,
    "preview": "log\n*cpu\n*gpu\ncore*\n"
  },
  {
    "path": "3rdparty/mshadow/guide/mshadow-ps/Makefile",
    "chars": 1869,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "3rdparty/mshadow/guide/mshadow-ps/README.md",
    "chars": 10951,
    "preview": "<!--- Licensed to the Apache Software Foundation (ASF) under one -->\n<!--- or more contributor license agreements.  See "
  },
  {
    "path": "3rdparty/mshadow/guide/mshadow-ps/dbstr.h",
    "chars": 1647,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/guide/mshadow-ps/dist_async_sum-inl.h",
    "chars": 4425,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/guide/mshadow-ps/dist_async_sum.cpp",
    "chars": 1054,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/guide/mshadow-ps/local.sh",
    "chars": 1699,
    "preview": "#!/bin/bash\n\n# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See"
  },
  {
    "path": "3rdparty/mshadow/guide/mshadow-ps/local_sum-inl.h",
    "chars": 4638,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/guide/mshadow-ps/local_sum.cpp",
    "chars": 915,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/guide/mshadow-ps/local_sum.cu",
    "chars": 915,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/guide/neuralnet/Makefile",
    "chars": 1665,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "3rdparty/mshadow/guide/neuralnet/README.md",
    "chars": 1511,
    "preview": "<!--- Licensed to the Apache Software Foundation (ASF) under one -->\n<!--- or more contributor license agreements.  See "
  },
  {
    "path": "3rdparty/mshadow/guide/neuralnet/convnet.cu",
    "chars": 11641,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/guide/neuralnet/nnet.cu",
    "chars": 7084,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/guide/neuralnet/nnet_ps.cu",
    "chars": 10916,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/guide/neuralnet/util.h",
    "chars": 3206,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/make/README.md",
    "chars": 1870,
    "preview": "<!--- Licensed to the Apache Software Foundation (ASF) under one -->\n<!--- or more contributor license agreements.  See "
  },
  {
    "path": "3rdparty/mshadow/make/mshadow.mk",
    "chars": 5973,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "3rdparty/mshadow/mshadow/README.md",
    "chars": 1299,
    "preview": "<!--- Licensed to the Apache Software Foundation (ASF) under one -->\n<!--- or more contributor license agreements.  See "
  },
  {
    "path": "3rdparty/mshadow/mshadow/base.h",
    "chars": 69451,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/bfloat.h",
    "chars": 6991,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/cuda/reduce.cuh",
    "chars": 4518,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/cuda/tensor_gpu-inl.cuh",
    "chars": 35405,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/dot_engine-inl.h",
    "chars": 42534,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/expr_engine-inl.h",
    "chars": 18275,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/expr_scalar-inl.h",
    "chars": 7544,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/expression.h",
    "chars": 16361,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/extension/broadcast.h",
    "chars": 6869,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/extension/broadcast_with_axis.h",
    "chars": 10764,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/extension/channel_pool.h",
    "chars": 5034,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/extension/channel_unpool.h",
    "chars": 6180,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/extension/choose.h",
    "chars": 3997,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/extension/complex.h",
    "chars": 20028,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/extension/concat.h",
    "chars": 7605,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/extension/crop.h",
    "chars": 5468,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/extension/fill.h",
    "chars": 4789,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/extension/flip.h",
    "chars": 4906,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/extension/implicit_gemm.h",
    "chars": 4939,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/extension/mask.h",
    "chars": 3545,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/extension/mirror.h",
    "chars": 2770,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/extension/one_hot.h",
    "chars": 3417,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/extension/pack_col2patch.h",
    "chars": 7224,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/extension/pad.h",
    "chars": 4622,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/extension/range.h",
    "chars": 4497,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/extension/reduce_with_axis.h",
    "chars": 5461,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/extension/reduceto1d.h",
    "chars": 5000,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/extension/reshape.h",
    "chars": 3691,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/extension/slice.h",
    "chars": 6028,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/extension/slice_ex.h",
    "chars": 5091,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/extension/spatial_pool.h",
    "chars": 6794,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/extension/spatial_unpool.h",
    "chars": 6377,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/extension/spatial_upsampling_nearest.h",
    "chars": 3134,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/extension/swapaxis.h",
    "chars": 4716,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/extension/take.h",
    "chars": 3554,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/extension/take_grad.h",
    "chars": 4126,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/extension/transpose.h",
    "chars": 8537,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/extension/unpack_patch2col.h",
    "chars": 7171,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/extension.h",
    "chars": 2145,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/half.h",
    "chars": 14370,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/io.h",
    "chars": 6327,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/packet/plain-inl.h",
    "chars": 3238,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/packet/sse-inl.h",
    "chars": 5766,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/packet-inl.h",
    "chars": 14479,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/random.h",
    "chars": 18158,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/stream_gpu-inl.h",
    "chars": 10104,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/tensor.h",
    "chars": 43209,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/tensor_container.h",
    "chars": 7031,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/tensor_cpu-inl.h",
    "chars": 23709,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow/tensor_gpu-inl.h",
    "chars": 11075,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow-ps/.gitignore",
    "chars": 23,
    "preview": "Makefile\ntest\ntest.cpp\n"
  },
  {
    "path": "3rdparty/mshadow/mshadow-ps/README.md",
    "chars": 1082,
    "preview": "<!--- Licensed to the Apache Software Foundation (ASF) under one -->\n<!--- or more contributor license agreements.  See "
  },
  {
    "path": "3rdparty/mshadow/mshadow-ps/mshadow_ps.h",
    "chars": 13110,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow-ps/ps_dist-inl.h",
    "chars": 4495,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow-ps/ps_local-inl.h",
    "chars": 27044,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow-ps/ps_rabit-inl.h",
    "chars": 4238,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow-ps/thread.h",
    "chars": 6887,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/mshadow-ps/thread_util.h",
    "chars": 5039,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/scripts/travis_script.sh",
    "chars": 1337,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "3rdparty/mshadow/test/Makefile",
    "chars": 1606,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "3rdparty/mshadow/test/pairtest.cu",
    "chars": 4150,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/test/pool.cu",
    "chars": 2912,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/test/reshape.cu",
    "chars": 2914,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/test/test.cu",
    "chars": 3129,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/test/test.h",
    "chars": 2333,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "3rdparty/mshadow/test/unpack.cu",
    "chars": 3241,
    "preview": "/*\n * Licensed to the Apache Software Foundation (ASF) under one\n * or more contributor license agreements.  See the NOT"
  },
  {
    "path": "CMakeLists.txt",
    "chars": 43801,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "CODEOWNERS",
    "chars": 2300,
    "preview": "# Watchers and contributors to Apache MXNet repo directories/packages/files\n# Please see documentation of use of CODEOWN"
  },
  {
    "path": "CODE_OF_CONDUCT.md",
    "chars": 2608,
    "preview": "<!---\n  Licensed to the Apache Software Foundation (ASF) under one\n  or more contributor license agreements.  See the NO"
  },
  {
    "path": "CONTRIBUTORS.md",
    "chars": 15308,
    "preview": "<!--\n  ~ Licensed to the Apache Software Foundation (ASF) under one\n  ~ or more contributor license agreements.  See the"
  },
  {
    "path": "DNNL_README.md",
    "chars": 1000,
    "preview": "<!--\n  ~ Licensed to the Apache Software Foundation (ASF) under one\n  ~ or more contributor license agreements.  See the"
  },
  {
    "path": "LICENSE",
    "chars": 19749,
    "preview": "\n                                 Apache License\n                           Version 2.0, January 2004\n                  "
  },
  {
    "path": "NEWS.md",
    "chars": 204724,
    "preview": "<!--\n  ~ Licensed to the Apache Software Foundation (ASF) under one\n  ~ or more contributor license agreements.  See the"
  },
  {
    "path": "NOTICE",
    "chars": 580,
    "preview": "    Apache MXNET\n    Copyright 2017-2023 The Apache Software Foundation\n\n    This product includes software developed at"
  },
  {
    "path": "README.md",
    "chars": 16698,
    "preview": "<!--\n  ~ Licensed to the Apache Software Foundation (ASF) under one\n  ~ or more contributor license agreements.  See the"
  },
  {
    "path": "SECURITY.md",
    "chars": 1299,
    "preview": "<!--\n  ~ Licensed to the Apache Software Foundation (ASF) under one\n  ~ or more contributor license agreements.  See the"
  },
  {
    "path": "benchmark/__init__.py",
    "chars": 785,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "benchmark/opperf/README.md",
    "chars": 14246,
    "preview": "<!--\n  ~ Licensed to the Apache Software Foundation (ASF) under one\n  ~ or more contributor license agreements.  See the"
  },
  {
    "path": "benchmark/opperf/__init__.py",
    "chars": 785,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "benchmark/opperf/custom_operations/__init__.py",
    "chars": 785,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "benchmark/opperf/custom_operations/custom_operations.py",
    "chars": 2253,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "benchmark/opperf/nd_operations/README.md",
    "chars": 1125,
    "preview": "<!--\n  ~ Licensed to the Apache Software Foundation (ASF) under one\n  ~ or more contributor license agreements.  See the"
  },
  {
    "path": "benchmark/opperf/nd_operations/__init__.py",
    "chars": 785,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "benchmark/opperf/nd_operations/array_manipulation_operators.py",
    "chars": 10117,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "benchmark/opperf/nd_operations/array_rearrange.py",
    "chars": 2262,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "benchmark/opperf/nd_operations/binary_operators.py",
    "chars": 5490,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "benchmark/opperf/nd_operations/gemm_operators.py",
    "chars": 5938,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "benchmark/opperf/nd_operations/indexing_routines.py",
    "chars": 2314,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "benchmark/opperf/nd_operations/linalg_operators.py",
    "chars": 3754,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "benchmark/opperf/nd_operations/misc_operators.py",
    "chars": 7524,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "benchmark/opperf/nd_operations/nn_activation_operators.py",
    "chars": 2525,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "benchmark/opperf/nd_operations/nn_basic_operators.py",
    "chars": 6715,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "benchmark/opperf/nd_operations/nn_conv_operators.py",
    "chars": 17997,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "benchmark/opperf/nd_operations/nn_loss_operators.py",
    "chars": 2214,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "benchmark/opperf/nd_operations/nn_optimizer_operators.py",
    "chars": 7831,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "benchmark/opperf/nd_operations/random_sampling_operators.py",
    "chars": 2857,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "benchmark/opperf/nd_operations/reduction_operators.py",
    "chars": 2519,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "benchmark/opperf/nd_operations/sorting_searching_operators.py",
    "chars": 2297,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "benchmark/opperf/nd_operations/unary_operators.py",
    "chars": 4170,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "benchmark/opperf/opperf.py",
    "chars": 13600,
    "preview": "#!/usr/bin/env python3\n#\n# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agre"
  },
  {
    "path": "benchmark/opperf/results/mxnet_operator_benchmark_results_cpu.md",
    "chars": 24773,
    "preview": "<!--\n  ~ Licensed to the Apache Software Foundation (ASF) under one\n  ~ or more contributor license agreements.  See the"
  },
  {
    "path": "benchmark/opperf/results/mxnet_operator_benchmark_results_gpu.md",
    "chars": 24573,
    "preview": "<!--\n  ~ Licensed to the Apache Software Foundation (ASF) under one\n  ~ or more contributor license agreements.  See the"
  },
  {
    "path": "benchmark/opperf/rules/__init__.py",
    "chars": 785,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "benchmark/opperf/rules/default_params.py",
    "chars": 39781,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "benchmark/opperf/utils/__init__.py",
    "chars": 785,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "benchmark/opperf/utils/benchmark_operators_pytest.py",
    "chars": 6113,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "benchmark/opperf/utils/benchmark_utils.py",
    "chars": 13848,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "benchmark/opperf/utils/common_utils.py",
    "chars": 5302,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "benchmark/opperf/utils/ndarray_utils.py",
    "chars": 5230,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "benchmark/opperf/utils/op_registry_utils.py",
    "chars": 24021,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "benchmark/opperf/utils/profiler_utils.py",
    "chars": 11918,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "benchmark/python/control_flow/rnn.py",
    "chars": 4928,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "benchmark/python/dnnl/fc_add.py",
    "chars": 6980,
    "preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE f"
  },
  {
    "path": "benchmark/python/dnnl/run.sh",
    "chars": 1806,
    "preview": "#!/bin/bash\n\n# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See"
  }
]

// ... and 2443 more files (download for full content)

About this extraction

This page contains the full source code of the apache/mxnet GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 2643 files (28.3 MB), approximately 4.1M tokens, and a symbol index with 7970 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Copied to clipboard!