Repository: apache/mxnet
Branch: master
Commit: b84609d3fc73
Files: 2643
Total size: 28.3 MB
Directory structure:
gitextract_zlms863u/
├── .asf.yaml
├── .clang-format
├── .clang-tidy
├── .cmakelintrc
├── .codecov.yml
├── .git-blame-ignore-revs
├── .gitattributes
├── .github/
│ ├── ISSUE_TEMPLATE/
│ │ ├── bug_report.md
│ │ ├── config.yml
│ │ ├── feature_request.md
│ │ ├── flaky_test.md
│ │ └── rfc.md
│ ├── PULL_REQUEST_TEMPLATE.md
│ └── workflows/
│ ├── greetings.yml
│ ├── license_check.yml
│ ├── link_check.yml
│ ├── os_x_mklbuild.yml
│ └── os_x_staticbuild.yml
├── .gitignore
├── .gitmodules
├── .licenserc.yaml
├── .mxnet_root
├── 3rdparty/
│ ├── ctc_include/
│ │ ├── LICENSE
│ │ ├── contrib/
│ │ │ └── moderngpu/
│ │ │ ├── LICENSE
│ │ │ └── include/
│ │ │ ├── device/
│ │ │ │ ├── ctaloadbalance.cuh
│ │ │ │ ├── ctamerge.cuh
│ │ │ │ ├── ctascan.cuh
│ │ │ │ ├── ctasearch.cuh
│ │ │ │ ├── ctasegreduce.cuh
│ │ │ │ ├── ctasegscan.cuh
│ │ │ │ ├── ctasegsort.cuh
│ │ │ │ ├── ctasortedsearch.cuh
│ │ │ │ ├── devicetypes.cuh
│ │ │ │ ├── deviceutil.cuh
│ │ │ │ ├── intrinsics.cuh
│ │ │ │ ├── loadstore.cuh
│ │ │ │ ├── serialsets.cuh
│ │ │ │ └── sortnetwork.cuh
│ │ │ ├── mgpudevice.cuh
│ │ │ ├── mgpuenums.h
│ │ │ └── util/
│ │ │ └── static.h
│ │ └── detail/
│ │ ├── cpu_ctc.h
│ │ ├── ctc_helper.h
│ │ ├── gpu_ctc.h
│ │ ├── gpu_ctc_kernels.h
│ │ └── hostdevice.h
│ ├── miniz/
│ │ ├── miniz.c
│ │ └── miniz.h
│ └── mshadow/
│ ├── .gitignore
│ ├── .travis.yml
│ ├── CHANGES.md
│ ├── CMakeLists.txt
│ ├── LICENSE
│ ├── README.md
│ ├── cmake/
│ │ └── AutoDetectF16C.cmake
│ ├── doc/
│ │ ├── Doxyfile
│ │ ├── README.md
│ │ └── mkdoc.sh
│ ├── guide/
│ │ ├── .gitignore
│ │ ├── Makefile
│ │ ├── README.md
│ │ ├── basic.cpp
│ │ ├── basic_stream.cu
│ │ ├── defop.cpp
│ │ ├── exp-template/
│ │ │ ├── .gitignore
│ │ │ ├── Makefile
│ │ │ └── README.md
│ │ ├── mshadow-ps/
│ │ │ ├── .gitignore
│ │ │ ├── Makefile
│ │ │ ├── README.md
│ │ │ ├── dbstr.h
│ │ │ ├── dist_async_sum-inl.h
│ │ │ ├── dist_async_sum.cpp
│ │ │ ├── local.sh
│ │ │ ├── local_sum-inl.h
│ │ │ ├── local_sum.cpp
│ │ │ └── local_sum.cu
│ │ └── neuralnet/
│ │ ├── Makefile
│ │ ├── README.md
│ │ ├── convnet.cu
│ │ ├── nnet.cu
│ │ ├── nnet_ps.cu
│ │ └── util.h
│ ├── make/
│ │ ├── README.md
│ │ └── mshadow.mk
│ ├── mshadow/
│ │ ├── README.md
│ │ ├── base.h
│ │ ├── bfloat.h
│ │ ├── cuda/
│ │ │ ├── reduce.cuh
│ │ │ └── tensor_gpu-inl.cuh
│ │ ├── dot_engine-inl.h
│ │ ├── expr_engine-inl.h
│ │ ├── expr_scalar-inl.h
│ │ ├── expression.h
│ │ ├── extension/
│ │ │ ├── broadcast.h
│ │ │ ├── broadcast_with_axis.h
│ │ │ ├── channel_pool.h
│ │ │ ├── channel_unpool.h
│ │ │ ├── choose.h
│ │ │ ├── complex.h
│ │ │ ├── concat.h
│ │ │ ├── crop.h
│ │ │ ├── fill.h
│ │ │ ├── flip.h
│ │ │ ├── implicit_gemm.h
│ │ │ ├── mask.h
│ │ │ ├── mirror.h
│ │ │ ├── one_hot.h
│ │ │ ├── pack_col2patch.h
│ │ │ ├── pad.h
│ │ │ ├── range.h
│ │ │ ├── reduce_with_axis.h
│ │ │ ├── reduceto1d.h
│ │ │ ├── reshape.h
│ │ │ ├── slice.h
│ │ │ ├── slice_ex.h
│ │ │ ├── spatial_pool.h
│ │ │ ├── spatial_unpool.h
│ │ │ ├── spatial_upsampling_nearest.h
│ │ │ ├── swapaxis.h
│ │ │ ├── take.h
│ │ │ ├── take_grad.h
│ │ │ ├── transpose.h
│ │ │ └── unpack_patch2col.h
│ │ ├── extension.h
│ │ ├── half.h
│ │ ├── io.h
│ │ ├── packet/
│ │ │ ├── plain-inl.h
│ │ │ └── sse-inl.h
│ │ ├── packet-inl.h
│ │ ├── random.h
│ │ ├── stream_gpu-inl.h
│ │ ├── tensor.h
│ │ ├── tensor_container.h
│ │ ├── tensor_cpu-inl.h
│ │ └── tensor_gpu-inl.h
│ ├── mshadow-ps/
│ │ ├── .gitignore
│ │ ├── README.md
│ │ ├── mshadow_ps.h
│ │ ├── ps_dist-inl.h
│ │ ├── ps_local-inl.h
│ │ ├── ps_rabit-inl.h
│ │ ├── thread.h
│ │ └── thread_util.h
│ ├── scripts/
│ │ └── travis_script.sh
│ └── test/
│ ├── Makefile
│ ├── pairtest.cu
│ ├── pool.cu
│ ├── reshape.cu
│ ├── test.cu
│ ├── test.h
│ └── unpack.cu
├── CMakeLists.txt
├── CODEOWNERS
├── CODE_OF_CONDUCT.md
├── CONTRIBUTORS.md
├── DNNL_README.md
├── LICENSE
├── NEWS.md
├── NOTICE
├── README.md
├── SECURITY.md
├── benchmark/
│ ├── __init__.py
│ ├── opperf/
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── custom_operations/
│ │ │ ├── __init__.py
│ │ │ └── custom_operations.py
│ │ ├── nd_operations/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── array_manipulation_operators.py
│ │ │ ├── array_rearrange.py
│ │ │ ├── binary_operators.py
│ │ │ ├── gemm_operators.py
│ │ │ ├── indexing_routines.py
│ │ │ ├── linalg_operators.py
│ │ │ ├── misc_operators.py
│ │ │ ├── nn_activation_operators.py
│ │ │ ├── nn_basic_operators.py
│ │ │ ├── nn_conv_operators.py
│ │ │ ├── nn_loss_operators.py
│ │ │ ├── nn_optimizer_operators.py
│ │ │ ├── random_sampling_operators.py
│ │ │ ├── reduction_operators.py
│ │ │ ├── sorting_searching_operators.py
│ │ │ └── unary_operators.py
│ │ ├── opperf.py
│ │ ├── results/
│ │ │ ├── mxnet_operator_benchmark_results_cpu.md
│ │ │ └── mxnet_operator_benchmark_results_gpu.md
│ │ ├── rules/
│ │ │ ├── __init__.py
│ │ │ └── default_params.py
│ │ └── utils/
│ │ ├── __init__.py
│ │ ├── benchmark_operators_pytest.py
│ │ ├── benchmark_utils.py
│ │ ├── common_utils.py
│ │ ├── ndarray_utils.py
│ │ ├── op_registry_utils.py
│ │ └── profiler_utils.py
│ └── python/
│ ├── control_flow/
│ │ └── rnn.py
│ ├── dnnl/
│ │ ├── fc_add.py
│ │ ├── run.sh
│ │ └── run_per_thread.sh
│ ├── einsum/
│ │ └── benchmark_einsum.py
│ ├── ffi/
│ │ └── benchmark_ffi.py
│ ├── metric/
│ │ └── benchmark_metric.py
│ ├── quantization/
│ │ └── benchmark_op.py
│ ├── sparse/
│ │ ├── cast_storage.py
│ │ ├── dot.py
│ │ ├── memory_benchmark.py
│ │ ├── sparse_op.py
│ │ ├── updater.py
│ │ └── util.py
│ └── tvmop/
│ └── benchmark_tvmop.py
├── cd/
│ ├── Jenkinsfile_cd_pipeline
│ ├── Jenkinsfile_release_job
│ ├── Jenkinsfile_utils.groovy
│ ├── README.md
│ ├── mxnet_lib/
│ │ ├── Jenkins_pipeline.groovy
│ │ └── mxnet_lib_pipeline.groovy
│ ├── python/
│ │ ├── docker/
│ │ │ ├── Dockerfile
│ │ │ ├── Dockerfile.test
│ │ │ ├── Jenkins_pipeline.groovy
│ │ │ ├── python_images.sh
│ │ │ └── test_python_image.sh
│ │ └── pypi/
│ │ ├── Jenkins_pipeline.groovy
│ │ ├── README.md
│ │ ├── pypi_package.sh
│ │ └── pypi_publish.py
│ └── utils/
│ ├── artifact_repository.md
│ ├── artifact_repository.py
│ ├── docker_tag.sh
│ ├── mxnet_base_image.sh
│ └── test_artifact_repository.py
├── ci/
│ ├── Jenkinsfile_docker_cache
│ ├── Jenkinsfile_utils.groovy
│ ├── README.md
│ ├── __init__.py
│ ├── build.py
│ ├── build_windows.py
│ ├── dev_menu.py
│ ├── docker/
│ │ ├── Dockerfile.build.android
│ │ ├── Dockerfile.build.arm
│ │ ├── Dockerfile.build.centos7
│ │ ├── Dockerfile.build.jetson
│ │ ├── Dockerfile.build.ubuntu
│ │ ├── Dockerfile.build.ubuntu_cpu_jekyll
│ │ ├── Dockerfile.publish.test.centos7
│ │ ├── Dockerfile.test.arm
│ │ ├── docker-compose.yml
│ │ ├── install/
│ │ │ ├── deb_ubuntu_ccache.sh
│ │ │ ├── docker_filepermissions.sh
│ │ │ ├── requirements
│ │ │ └── ubuntu_adduser.sh
│ │ ├── runtime_functions.sh
│ │ └── toolchains/
│ │ ├── aarch64-linux-gnu-toolchain.cmake
│ │ └── arm-linux-gnueabihf-toolchain.cmake
│ ├── docker_login.py
│ ├── jenkins/
│ │ ├── Jenkins_steps.groovy
│ │ ├── Jenkinsfile_centos_cpu
│ │ ├── Jenkinsfile_centos_gpu
│ │ ├── Jenkinsfile_clang
│ │ ├── Jenkinsfile_edge
│ │ ├── Jenkinsfile_full
│ │ ├── Jenkinsfile_miscellaneous
│ │ ├── Jenkinsfile_sanity
│ │ ├── Jenkinsfile_tools
│ │ ├── Jenkinsfile_unix_cpu
│ │ ├── Jenkinsfile_unix_gpu
│ │ ├── Jenkinsfile_website_beta
│ │ ├── Jenkinsfile_website_full
│ │ ├── Jenkinsfile_website_full_pr
│ │ ├── Jenkinsfile_website_jekyll_docs
│ │ ├── Jenkinsfile_website_mxnet_build
│ │ ├── Jenkinsfile_website_nightly
│ │ ├── Jenkinsfile_website_python_docs
│ │ ├── Jenkinsfile_website_version_artifacts
│ │ ├── Jenkinsfile_windows_cpu
│ │ └── Jenkinsfile_windows_gpu
│ ├── logging.conf
│ ├── other/
│ │ └── ci_deploy_doc.sh
│ ├── publish/
│ │ ├── Jenkinsfile
│ │ ├── README.md
│ │ ├── python/
│ │ │ └── build.sh
│ │ ├── scala/
│ │ │ ├── build.sh
│ │ │ ├── buildkey.py
│ │ │ ├── deploy.sh
│ │ │ ├── fullDeploy.sh
│ │ │ └── test.sh
│ │ └── website/
│ │ ├── README.md
│ │ ├── beta-deploy.sh
│ │ ├── deploy.sh
│ │ └── publish_artifacts.sh
│ ├── test_docker_login.py
│ ├── util.py
│ └── windows/
│ ├── test_py3_cpu.ps1
│ └── test_py3_gpu.ps1
├── cmake/
│ ├── BuildCythonModules.cmake
│ ├── BuildTVM.cmake
│ ├── ChooseBlas.cmake
│ ├── Modules/
│ │ ├── FindAccelerate.cmake
│ │ ├── FindAtlas.cmake
│ │ ├── FindCUDNN.cmake
│ │ ├── FindCUTENSOR.cmake
│ │ ├── FindGperftools.cmake
│ │ ├── FindJeMalloc.cmake
│ │ ├── FindNCCL.cmake
│ │ ├── FindNVML.cmake
│ │ ├── FindNVTX.cmake
│ │ └── FindOpenBLAS.cmake
│ ├── Utils.cmake
│ ├── libmxnet.sym
│ └── upstream/
│ ├── FindBLAS.cmake
│ ├── FindCUDAToolkit.cmake
│ └── select_compute_arch.cmake
├── config/
│ ├── darwin.cmake
│ ├── distribution/
│ │ ├── darwin_cpu.cmake
│ │ ├── darwin_cpu_mkl.cmake
│ │ ├── darwin_native.cmake
│ │ ├── linux_cpu.cmake
│ │ ├── linux_cpu_mkl.cmake
│ │ ├── linux_cu100.cmake
│ │ ├── linux_cu101.cmake
│ │ ├── linux_cu102.cmake
│ │ ├── linux_cu110.cmake
│ │ ├── linux_cu112.cmake
│ │ ├── linux_cu92.cmake
│ │ └── linux_native.cmake
│ ├── linux.cmake
│ └── linux_gpu.cmake
├── conftest.py
├── contrib/
│ └── tvmop/
│ ├── __init__.py
│ ├── basic/
│ │ ├── __init__.py
│ │ └── ufunc.py
│ ├── compile.py
│ ├── core/
│ │ ├── __init__.py
│ │ ├── fromnumeric.py
│ │ ├── multiarray.py
│ │ └── umath.py
│ ├── opdef.py
│ ├── space.py
│ └── utils.py
├── cpp-package/
│ ├── CMakeLists.txt
│ ├── README.md
│ ├── example/
│ │ ├── CMakeLists.txt
│ │ ├── README.md
│ │ ├── alexnet.cpp
│ │ ├── charRNN.cpp
│ │ ├── feature_extract/
│ │ │ ├── README.md
│ │ │ ├── feature_extract.cpp
│ │ │ ├── prepare_data_with_opencv.cpp
│ │ │ └── run.sh
│ │ ├── get_data.sh
│ │ ├── googlenet.cpp
│ │ ├── inception_bn.cpp
│ │ ├── inference/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── README.md
│ │ │ ├── imagenet_inference.cpp
│ │ │ ├── multi_threaded_inference/
│ │ │ │ ├── get_model.py
│ │ │ │ ├── multi_threaded_inference.cc
│ │ │ │ └── unit_test_multi_threaded_inference.sh
│ │ │ ├── sentiment_analysis_rnn.cpp
│ │ │ ├── unit_test_imagenet_inference.sh
│ │ │ └── unit_test_sentiment_analysis_rnn.sh
│ │ ├── lenet.cpp
│ │ ├── lenet_with_mxdataiter.cpp
│ │ ├── mlp.cpp
│ │ ├── mlp_cpu.cpp
│ │ ├── mlp_csv.cpp
│ │ ├── mlp_gpu.cpp
│ │ ├── mnist_to_csv.py
│ │ ├── resnet.cpp
│ │ ├── run_lenet_with_mxdataiter.sh
│ │ ├── test_kvstore.cpp
│ │ ├── test_ndarray_copy.cpp
│ │ ├── test_optimizer.cpp
│ │ ├── test_regress_label.cpp
│ │ ├── test_score.cpp
│ │ ├── unittests/
│ │ │ └── unit_test_mlp_csv.sh
│ │ └── utils.h
│ ├── include/
│ │ └── mxnet-cpp/
│ │ ├── .gitignore
│ │ ├── CPPLINT.cfg
│ │ ├── MxNetCpp.h
│ │ ├── base.h
│ │ ├── contrib.h
│ │ ├── executor.h
│ │ ├── executor.hpp
│ │ ├── initializer.h
│ │ ├── io.h
│ │ ├── io.hpp
│ │ ├── kvstore.h
│ │ ├── kvstore.hpp
│ │ ├── lr_scheduler.h
│ │ ├── metric.h
│ │ ├── model.h
│ │ ├── ndarray.h
│ │ ├── ndarray.hpp
│ │ ├── op_map.h
│ │ ├── op_suppl.h
│ │ ├── op_util.h
│ │ ├── operator.h
│ │ ├── operator.hpp
│ │ ├── optimizer.h
│ │ ├── optimizer.hpp
│ │ ├── shape.h
│ │ ├── symbol.h
│ │ └── symbol.hpp
│ ├── scripts/
│ │ ├── OpWrapperGenerator.py
│ │ └── lint.py
│ └── tests/
│ └── ci_test.sh
├── doap.rdf
├── docker/
│ ├── .gitignore
│ ├── Dockerfiles/
│ │ ├── Dockerfile.in.julia
│ │ ├── Dockerfile.in.lib.cpu
│ │ ├── Dockerfile.in.lib.gpu
│ │ ├── Dockerfile.in.perl
│ │ ├── Dockerfile.in.python
│ │ ├── Dockerfile.in.r-lang
│ │ └── Dockerfile.in.scala
│ ├── README.md
│ ├── docker-python/
│ │ ├── README.md
│ │ ├── build_python_dockerfile.sh
│ │ └── test_mxnet.py
│ ├── install/
│ │ ├── cpp.sh
│ │ ├── julia.sh
│ │ ├── perl.sh
│ │ ├── python.sh
│ │ ├── r.sh
│ │ └── scala.sh
│ ├── run.sh
│ └── tool.sh
├── docs/
│ ├── .dockerignore
│ ├── .gitignore
│ ├── README.md
│ ├── cpp_docs/
│ │ ├── Doxyfile
│ │ └── Makefile
│ ├── python_docs/
│ │ ├── README.md
│ │ ├── _static/
│ │ │ ├── autodoc.js
│ │ │ ├── feedback.css
│ │ │ ├── matomo_analytics.js
│ │ │ └── mxnet.css
│ │ ├── python/
│ │ │ ├── .gitignore
│ │ │ ├── Makefile
│ │ │ ├── Makefile_sphinx
│ │ │ ├── api/
│ │ │ │ ├── autograd/
│ │ │ │ │ └── index.rst
│ │ │ │ ├── contrib/
│ │ │ │ │ ├── index.rst
│ │ │ │ │ ├── io/
│ │ │ │ │ │ └── index.rst
│ │ │ │ │ ├── ndarray/
│ │ │ │ │ │ └── index.rst
│ │ │ │ │ ├── onnx/
│ │ │ │ │ │ └── index.rst
│ │ │ │ │ ├── quantization/
│ │ │ │ │ │ └── index.rst
│ │ │ │ │ ├── symbol/
│ │ │ │ │ │ └── index.rst
│ │ │ │ │ ├── tensorboard/
│ │ │ │ │ │ └── index.rst
│ │ │ │ │ ├── tensorrt/
│ │ │ │ │ │ └── index.rst
│ │ │ │ │ └── text/
│ │ │ │ │ └── index.rst
│ │ │ │ ├── device/
│ │ │ │ │ └── index.rst
│ │ │ │ ├── engine/
│ │ │ │ │ └── index.rst
│ │ │ │ ├── executor/
│ │ │ │ │ └── index.rst
│ │ │ │ ├── gluon/
│ │ │ │ │ ├── block.rst
│ │ │ │ │ ├── constant.rst
│ │ │ │ │ ├── contrib/
│ │ │ │ │ │ └── index.rst
│ │ │ │ │ ├── hybrid_block.rst
│ │ │ │ │ ├── index.rst
│ │ │ │ │ ├── loss/
│ │ │ │ │ │ └── index.rst
│ │ │ │ │ ├── metric/
│ │ │ │ │ │ └── index.rst
│ │ │ │ │ ├── model_zoo/
│ │ │ │ │ │ └── index.rst
│ │ │ │ │ ├── nn/
│ │ │ │ │ │ └── index.rst
│ │ │ │ │ ├── parameter.rst
│ │ │ │ │ ├── rnn/
│ │ │ │ │ │ └── index.rst
│ │ │ │ │ ├── symbol_block.rst
│ │ │ │ │ ├── trainer.rst
│ │ │ │ │ └── utils/
│ │ │ │ │ └── index.rst
│ │ │ │ ├── index.rst
│ │ │ │ ├── initializer/
│ │ │ │ │ └── index.rst
│ │ │ │ ├── kvstore/
│ │ │ │ │ └── index.rst
│ │ │ │ ├── kvstore_server/
│ │ │ │ │ └── index.rst
│ │ │ │ ├── legacy/
│ │ │ │ │ ├── callback/
│ │ │ │ │ │ └── index.rst
│ │ │ │ │ ├── image/
│ │ │ │ │ │ └── index.rst
│ │ │ │ │ ├── index.rst
│ │ │ │ │ ├── io/
│ │ │ │ │ │ └── index.rst
│ │ │ │ │ ├── ndarray/
│ │ │ │ │ │ ├── contrib/
│ │ │ │ │ │ │ └── index.rst
│ │ │ │ │ │ ├── image/
│ │ │ │ │ │ │ └── index.rst
│ │ │ │ │ │ ├── index.rst
│ │ │ │ │ │ ├── linalg/
│ │ │ │ │ │ │ └── index.rst
│ │ │ │ │ │ ├── ndarray.rst
│ │ │ │ │ │ ├── op/
│ │ │ │ │ │ │ └── index.rst
│ │ │ │ │ │ ├── random/
│ │ │ │ │ │ │ └── index.rst
│ │ │ │ │ │ ├── register/
│ │ │ │ │ │ │ └── index.rst
│ │ │ │ │ │ ├── sparse/
│ │ │ │ │ │ │ └── index.rst
│ │ │ │ │ │ └── utils/
│ │ │ │ │ │ └── index.rst
│ │ │ │ │ ├── recordio/
│ │ │ │ │ │ └── index.rst
│ │ │ │ │ ├── symbol/
│ │ │ │ │ │ ├── contrib/
│ │ │ │ │ │ │ └── index.rst
│ │ │ │ │ │ ├── image/
│ │ │ │ │ │ │ └── index.rst
│ │ │ │ │ │ ├── index.rst
│ │ │ │ │ │ ├── linalg/
│ │ │ │ │ │ │ └── index.rst
│ │ │ │ │ │ ├── op/
│ │ │ │ │ │ │ └── index.rst
│ │ │ │ │ │ ├── random/
│ │ │ │ │ │ │ └── index.rst
│ │ │ │ │ │ ├── register/
│ │ │ │ │ │ │ └── index.rst
│ │ │ │ │ │ ├── sparse/
│ │ │ │ │ │ │ └── index.rst
│ │ │ │ │ │ └── symbol.rst
│ │ │ │ │ └── visualization/
│ │ │ │ │ └── index.rst
│ │ │ │ ├── lr_scheduler/
│ │ │ │ │ └── index.rst
│ │ │ │ ├── np/
│ │ │ │ │ ├── arrays.indexing.rst
│ │ │ │ │ ├── arrays.ndarray.rst
│ │ │ │ │ ├── arrays.rst
│ │ │ │ │ ├── index.rst
│ │ │ │ │ ├── random/
│ │ │ │ │ │ └── index.rst
│ │ │ │ │ ├── routines.array-creation.rst
│ │ │ │ │ ├── routines.array-manipulation.rst
│ │ │ │ │ ├── routines.io.rst
│ │ │ │ │ ├── routines.linalg.rst
│ │ │ │ │ ├── routines.math.rst
│ │ │ │ │ ├── routines.rst
│ │ │ │ │ ├── routines.sort.rst
│ │ │ │ │ └── routines.statistics.rst
│ │ │ │ ├── npx/
│ │ │ │ │ └── index.rst
│ │ │ │ ├── optimizer/
│ │ │ │ │ └── index.rst
│ │ │ │ ├── profiler/
│ │ │ │ │ └── index.rst
│ │ │ │ ├── rtc/
│ │ │ │ │ └── index.rst
│ │ │ │ ├── runtime/
│ │ │ │ │ └── index.rst
│ │ │ │ ├── test_utils/
│ │ │ │ │ └── index.rst
│ │ │ │ └── util/
│ │ │ │ └── index.rst
│ │ │ ├── index.rst
│ │ │ ├── scripts/
│ │ │ │ ├── conf.py
│ │ │ │ ├── md2ipynb.py
│ │ │ │ └── process_rst.py
│ │ │ └── tutorials/
│ │ │ ├── deploy/
│ │ │ │ ├── export/
│ │ │ │ │ ├── index.rst
│ │ │ │ │ └── onnx.md
│ │ │ │ ├── index.rst
│ │ │ │ ├── inference/
│ │ │ │ │ ├── cpp.rst
│ │ │ │ │ ├── image_classification_jetson.md
│ │ │ │ │ └── index.rst
│ │ │ │ └── run-on-aws/
│ │ │ │ ├── cloud.md
│ │ │ │ ├── index.rst
│ │ │ │ ├── use_ec2.rst
│ │ │ │ └── use_sagemaker.rst
│ │ │ ├── extend/
│ │ │ │ ├── customop.md
│ │ │ │ └── index.rst
│ │ │ ├── getting-started/
│ │ │ │ ├── crash-course/
│ │ │ │ │ ├── 0-introduction.md
│ │ │ │ │ ├── 1-nparray.md
│ │ │ │ │ ├── 2-create-nn.md
│ │ │ │ │ ├── 3-autograd.md
│ │ │ │ │ ├── 4-components.md
│ │ │ │ │ ├── 5-datasets.md
│ │ │ │ │ ├── 6-train-nn.md
│ │ │ │ │ ├── 7-use-gpus.md
│ │ │ │ │ ├── index.rst
│ │ │ │ │ └── prepare_dataset.py
│ │ │ │ ├── gluon_from_experiment_to_deployment.md
│ │ │ │ ├── gluon_migration_guide.md
│ │ │ │ ├── index.rst
│ │ │ │ ├── logistic_regression_explained.md
│ │ │ │ └── to-mxnet/
│ │ │ │ ├── index.rst
│ │ │ │ └── pytorch.md
│ │ │ ├── index.rst
│ │ │ ├── packages/
│ │ │ │ ├── autograd/
│ │ │ │ │ └── index.md
│ │ │ │ ├── gluon/
│ │ │ │ │ ├── blocks/
│ │ │ │ │ │ ├── activations/
│ │ │ │ │ │ │ └── activations.md
│ │ │ │ │ │ ├── custom-layer.md
│ │ │ │ │ │ ├── hybridize.md
│ │ │ │ │ │ ├── index.rst
│ │ │ │ │ │ ├── init.md
│ │ │ │ │ │ ├── naming.md
│ │ │ │ │ │ ├── nn.md
│ │ │ │ │ │ ├── parameters.md
│ │ │ │ │ │ └── save_load_params.md
│ │ │ │ │ ├── image/
│ │ │ │ │ │ ├── index.rst
│ │ │ │ │ │ ├── info_gan.md
│ │ │ │ │ │ └── mnist.md
│ │ │ │ │ ├── index.rst
│ │ │ │ │ ├── loss/
│ │ │ │ │ │ ├── custom-loss.md
│ │ │ │ │ │ ├── index.rst
│ │ │ │ │ │ ├── kl_divergence.md
│ │ │ │ │ │ └── loss.md
│ │ │ │ │ ├── text/
│ │ │ │ │ │ ├── gnmt.rst
│ │ │ │ │ │ ├── index.rst
│ │ │ │ │ │ └── transformer.rst
│ │ │ │ │ └── training/
│ │ │ │ │ ├── fit_api_tutorial.md
│ │ │ │ │ ├── index.rst
│ │ │ │ │ ├── learning_rates/
│ │ │ │ │ │ ├── index.rst
│ │ │ │ │ │ ├── learning_rate_finder.md
│ │ │ │ │ │ ├── learning_rate_schedules.md
│ │ │ │ │ │ └── learning_rate_schedules_advanced.md
│ │ │ │ │ ├── normalization/
│ │ │ │ │ │ └── index.md
│ │ │ │ │ └── trainer.md
│ │ │ │ ├── index.rst
│ │ │ │ ├── kvstore/
│ │ │ │ │ ├── index.rst
│ │ │ │ │ └── kvstore.md
│ │ │ │ ├── legacy/
│ │ │ │ │ ├── index.rst
│ │ │ │ │ └── ndarray/
│ │ │ │ │ ├── 01-ndarray-intro.md
│ │ │ │ │ ├── 02-ndarray-operations.md
│ │ │ │ │ ├── 03-ndarray-contexts.md
│ │ │ │ │ ├── gotchas_numpy_in_mxnet.md
│ │ │ │ │ ├── index.rst
│ │ │ │ │ └── sparse/
│ │ │ │ │ ├── csr.md
│ │ │ │ │ ├── index.rst
│ │ │ │ │ └── row_sparse.md
│ │ │ │ ├── np/
│ │ │ │ │ ├── cheat-sheet.md
│ │ │ │ │ ├── index.rst
│ │ │ │ │ └── np-vs-numpy.md
│ │ │ │ ├── onnx/
│ │ │ │ │ ├── fine_tuning_gluon.md
│ │ │ │ │ ├── index.rst
│ │ │ │ │ └── inference_on_onnx_model.md
│ │ │ │ ├── optimizer/
│ │ │ │ │ └── index.md
│ │ │ │ └── viz/
│ │ │ │ └── index.rst
│ │ │ └── performance/
│ │ │ ├── backend/
│ │ │ │ ├── amp.md
│ │ │ │ ├── dnnl/
│ │ │ │ │ ├── dnnl_quantization.md
│ │ │ │ │ ├── dnnl_quantization_inc.md
│ │ │ │ │ ├── dnnl_readme.md
│ │ │ │ │ └── index.rst
│ │ │ │ ├── index.rst
│ │ │ │ ├── profiler.md
│ │ │ │ └── tvm.rst
│ │ │ ├── compression/
│ │ │ │ ├── index.rst
│ │ │ │ └── int8.rst
│ │ │ └── index.rst
│ │ ├── requirements
│ │ └── themes/
│ │ ├── .babelrc
│ │ ├── .circleci/
│ │ │ └── config.yml
│ │ ├── .gitignore
│ │ ├── .sassrc
│ │ └── mx-theme/
│ │ ├── LICENSE
│ │ ├── MANIFEST.in
│ │ ├── README.md
│ │ ├── mxtheme/
│ │ │ ├── __init__.py
│ │ │ ├── card.py
│ │ │ ├── drawer.html
│ │ │ ├── feedback.html
│ │ │ ├── footer.html
│ │ │ ├── header.html
│ │ │ ├── header_search.html
│ │ │ ├── header_sourcelink.html
│ │ │ ├── header_top.html
│ │ │ ├── layout.html
│ │ │ ├── localtoc.html
│ │ │ ├── relations.html
│ │ │ ├── search.html
│ │ │ ├── static/
│ │ │ │ ├── fontawesome/
│ │ │ │ │ └── all.css
│ │ │ │ ├── fonts.css
│ │ │ │ ├── sphinx_materialdesign_theme.css
│ │ │ │ └── sphinx_materialdesign_theme.js
│ │ │ └── theme.conf
│ │ ├── setup.py
│ │ └── src/
│ │ ├── js/
│ │ │ ├── adjust-height.js
│ │ │ ├── feedback.js
│ │ │ ├── scrollspy.js
│ │ │ └── sphinx_materialdesign_theme.js
│ │ └── scss/
│ │ ├── _root.scss
│ │ ├── _variables.scss
│ │ ├── admonitions/
│ │ │ └── _admonitions.scss
│ │ ├── blockquote/
│ │ │ └── _blockquote.scss
│ │ ├── card/
│ │ │ └── _card.scss
│ │ ├── code/
│ │ │ └── _code.scss
│ │ ├── downloadlink/
│ │ │ └── _downloadlink.scss
│ │ ├── drawer/
│ │ │ └── _drawer.scss
│ │ ├── fonts/
│ │ │ └── _material-icons.scss
│ │ ├── footer/
│ │ │ └── _footer.scss
│ │ ├── grid/
│ │ │ └── _simplegrid.scss
│ │ ├── header/
│ │ │ └── _header.scss
│ │ ├── headerings/
│ │ │ └── _headerings.scss
│ │ ├── layout/
│ │ │ └── _layout.scss
│ │ ├── lists/
│ │ │ └── _lists.scss
│ │ ├── search/
│ │ │ └── _search.scss
│ │ ├── sphinx_materialdesign_theme.scss
│ │ ├── tables/
│ │ │ └── _tables.scss
│ │ └── toc/
│ │ ├── _globaltoc.scss
│ │ ├── _localtoc.scss
│ │ └── _toctree.scss
│ ├── static_site/
│ │ ├── .gitignore
│ │ ├── .nojekyll
│ │ ├── Makefile
│ │ ├── README.md
│ │ └── src/
│ │ ├── .asf.yaml
│ │ ├── .gitignore
│ │ ├── .htaccess
│ │ ├── .nojekyll
│ │ ├── 404.html
│ │ ├── Gemfile
│ │ ├── _config.yml
│ │ ├── _config_beta.yml
│ │ ├── _config_prod.yml
│ │ ├── _includes/
│ │ │ ├── callout.html
│ │ │ ├── disqus_comments.html
│ │ │ ├── feedback.html
│ │ │ ├── footer.html
│ │ │ ├── get_started/
│ │ │ │ ├── cloud/
│ │ │ │ │ ├── cpu.md
│ │ │ │ │ └── gpu.md
│ │ │ │ ├── devices/
│ │ │ │ │ ├── nvidia-jetson.md
│ │ │ │ │ └── raspberry_pi.md
│ │ │ │ ├── get_started.html
│ │ │ │ ├── gpu_snippet.md
│ │ │ │ ├── linux/
│ │ │ │ │ ├── clojure/
│ │ │ │ │ │ └── build-from-source.md
│ │ │ │ │ ├── cpp/
│ │ │ │ │ │ └── build-from-source.md
│ │ │ │ │ ├── java/
│ │ │ │ │ │ └── build-from-source.md
│ │ │ │ │ ├── julia/
│ │ │ │ │ │ └── build-from-source.md
│ │ │ │ │ ├── perl/
│ │ │ │ │ │ └── build-from-source.md
│ │ │ │ │ ├── python/
│ │ │ │ │ │ ├── cpu/
│ │ │ │ │ │ │ ├── build-from-source.md
│ │ │ │ │ │ │ ├── docker.md
│ │ │ │ │ │ │ └── pip.md
│ │ │ │ │ │ └── gpu/
│ │ │ │ │ │ ├── build-from-source.md
│ │ │ │ │ │ ├── docker.md
│ │ │ │ │ │ └── pip.md
│ │ │ │ │ ├── r/
│ │ │ │ │ │ └── build-from-source.md
│ │ │ │ │ └── scala/
│ │ │ │ │ └── build-from-source.md
│ │ │ │ └── pip_snippet.md
│ │ │ ├── head.html
│ │ │ ├── header.html
│ │ │ ├── icon-github.html
│ │ │ ├── icon-twitter.html
│ │ │ ├── important.html
│ │ │ ├── matomo-analytics.html
│ │ │ ├── note.html
│ │ │ ├── social.html
│ │ │ ├── tip.html
│ │ │ └── warning.html
│ │ ├── _layouts/
│ │ │ ├── default.html
│ │ │ ├── home.html
│ │ │ ├── page.html
│ │ │ ├── page_api.html
│ │ │ ├── page_category.html
│ │ │ ├── page_landing_tutorials.html
│ │ │ └── post.html
│ │ ├── _plugins/
│ │ │ └── markdowner.rb
│ │ ├── _sass/
│ │ │ ├── feedback.scss
│ │ │ ├── generalVersionDropdown.scss
│ │ │ ├── globalSearch.scss
│ │ │ ├── minima/
│ │ │ │ ├── _base.scss
│ │ │ │ ├── _blog.scss
│ │ │ │ ├── _docs.scss
│ │ │ │ ├── _ecosystem.scss
│ │ │ │ ├── _features.scss
│ │ │ │ ├── _getting_started.scss
│ │ │ │ ├── _home.scss
│ │ │ │ ├── _layout.scss
│ │ │ │ ├── _syntax-highlighting.scss
│ │ │ │ ├── colorful.scss
│ │ │ │ └── simple-grid.scss
│ │ │ └── minima.scss
│ │ ├── assets/
│ │ │ ├── js/
│ │ │ │ ├── clipboard.js
│ │ │ │ ├── copycode.js
│ │ │ │ ├── feedback.js
│ │ │ │ ├── globalSearch.js
│ │ │ │ └── options.js
│ │ │ └── main.scss
│ │ ├── index.html
│ │ └── pages/
│ │ ├── api/
│ │ │ ├── api.html
│ │ │ ├── architecture/
│ │ │ │ ├── exception_handling.md
│ │ │ │ ├── note_data_loading.md
│ │ │ │ ├── note_engine.md
│ │ │ │ ├── note_memory.md
│ │ │ │ ├── overview.md
│ │ │ │ └── program_model.md
│ │ │ ├── clojure/
│ │ │ │ ├── docs/
│ │ │ │ │ └── tutorials/
│ │ │ │ │ ├── index.md
│ │ │ │ │ ├── kvstore.md
│ │ │ │ │ ├── module.md
│ │ │ │ │ ├── ndarray.md
│ │ │ │ │ ├── symbol.md
│ │ │ │ │ └── symbol_in_pictures.md
│ │ │ │ └── index.md
│ │ │ ├── cpp/
│ │ │ │ ├── docs/
│ │ │ │ │ └── tutorials/
│ │ │ │ │ ├── basics.md
│ │ │ │ │ ├── index.md
│ │ │ │ │ ├── multi_threaded_inference.md
│ │ │ │ │ ├── mxnet_cpp_inference_tutorial.md
│ │ │ │ │ └── subgraphAPI.md
│ │ │ │ └── index.md
│ │ │ ├── developer_guide/
│ │ │ │ ├── 1_github_contribution_and_PR_verification_tips.md
│ │ │ │ ├── debugging_and_performance_optimization_tips.md
│ │ │ │ ├── examine_forward_results_with_hooks.md
│ │ │ │ ├── exception_handing_and_custom_error_types.md
│ │ │ │ └── profiling.md
│ │ │ ├── faq/
│ │ │ │ ├── add_op_in_backend.md
│ │ │ │ ├── cloud.md
│ │ │ │ ├── distributed_training.md
│ │ │ │ ├── env_var.md
│ │ │ │ ├── float16.md
│ │ │ │ ├── gradient_compression.md
│ │ │ │ ├── large_tensor_support.md
│ │ │ │ ├── model_parallel_lstm.md
│ │ │ │ ├── new_op.md
│ │ │ │ ├── perf.md
│ │ │ │ ├── recordio.md
│ │ │ │ ├── s3_integration.md
│ │ │ │ ├── security.md
│ │ │ │ ├── tensor_inspector_tutorial.md
│ │ │ │ ├── using_rtc.md
│ │ │ │ └── why_mxnet.md
│ │ │ ├── java/
│ │ │ │ ├── docs/
│ │ │ │ │ └── tutorials/
│ │ │ │ │ ├── index.md
│ │ │ │ │ └── ssd_inference.md
│ │ │ │ └── index.md
│ │ │ ├── julia/
│ │ │ │ └── index.md
│ │ │ ├── perl/
│ │ │ │ ├── docs/
│ │ │ │ │ └── tutorials/
│ │ │ │ │ ├── index.md
│ │ │ │ │ ├── io.md
│ │ │ │ │ ├── kvstore.md
│ │ │ │ │ ├── ndarray.md
│ │ │ │ │ └── symbol.md
│ │ │ │ └── index.md
│ │ │ ├── python/
│ │ │ │ └── index.md
│ │ │ ├── r/
│ │ │ │ ├── docs/
│ │ │ │ │ └── tutorials/
│ │ │ │ │ ├── char_rnn_model.md
│ │ │ │ │ ├── classify_real_image_with_pretrained_model.md
│ │ │ │ │ ├── custom_iterator.md
│ │ │ │ │ ├── index.md
│ │ │ │ │ ├── multi_dim_lstm.md
│ │ │ │ │ ├── ndarray.md
│ │ │ │ │ └── symbol.md
│ │ │ │ └── index.md
│ │ │ └── scala/
│ │ │ ├── docs/
│ │ │ │ └── tutorials/
│ │ │ │ ├── index.md
│ │ │ │ ├── infer.md
│ │ │ │ ├── io.md
│ │ │ │ ├── kvstore.md
│ │ │ │ ├── ndarray.md
│ │ │ │ ├── symbol.md
│ │ │ │ └── symbol_in_pictures.md
│ │ │ └── index.md
│ │ ├── community/
│ │ │ ├── clang_format_guide.md
│ │ │ ├── code_guide.md
│ │ │ ├── code_review.md
│ │ │ ├── committer_guide.md
│ │ │ ├── community.md
│ │ │ ├── document.md
│ │ │ ├── error_handling.md
│ │ │ ├── git_howto.md
│ │ │ ├── index.md
│ │ │ └── pull_request.md
│ │ ├── ecosystem.html
│ │ ├── features.html
│ │ ├── get_started/
│ │ │ ├── build_from_source.md
│ │ │ ├── download.md
│ │ │ ├── index.html
│ │ │ ├── jetson_setup.md
│ │ │ └── validate_mxnet.md
│ │ └── trusted_by.html
│ └── tutorial_utils/
│ └── vision/
│ └── cnn_visualization/
│ └── gradcam.py
├── example/
│ ├── MXNetTutorialTemplate.ipynb
│ ├── README.md
│ ├── adversary/
│ │ ├── README.md
│ │ └── adversary_generation.ipynb
│ ├── bi-lstm-sort/
│ │ ├── README.md
│ │ └── bi-lstm-sort.ipynb
│ ├── distributed_training/
│ │ ├── README.md
│ │ ├── cifar10_dist.py
│ │ └── cifar10_kvstore_hvd.py
│ ├── distributed_training-horovod/
│ │ ├── README.md
│ │ ├── gluon_mnist.py
│ │ └── resnet50_imagenet.py
│ ├── extensions/
│ │ ├── lib_api/
│ │ │ ├── Makefile
│ │ │ ├── init_lib.cc
│ │ │ ├── libtest.cc
│ │ │ └── test_loading.py
│ │ ├── lib_custom_op/
│ │ │ ├── Makefile
│ │ │ ├── README.md
│ │ │ ├── gemm_lib.cc
│ │ │ ├── relu_lib.cc
│ │ │ ├── relu_lib.cu
│ │ │ ├── relu_lib.h
│ │ │ ├── test_gemm.py
│ │ │ ├── test_relu.py
│ │ │ ├── test_transposecsr.py
│ │ │ ├── test_transposerowsp.py
│ │ │ ├── transposecsr_lib.cc
│ │ │ └── transposerowsp_lib.cc
│ │ ├── lib_external_ops/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── README.md
│ │ │ ├── init_lib.cc
│ │ │ ├── min_ex-inl.h
│ │ │ ├── min_ex.cc
│ │ │ ├── min_ex.cu
│ │ │ └── test_loading.py
│ │ ├── lib_pass/
│ │ │ ├── Makefile
│ │ │ ├── README.md
│ │ │ ├── pass_lib.cc
│ │ │ └── test_pass.py
│ │ └── lib_subgraph/
│ │ ├── Makefile
│ │ ├── README.md
│ │ ├── subgraph_lib.cc
│ │ └── test_subgraph.py
│ ├── gluon/
│ │ ├── actor_critic/
│ │ │ ├── README.md
│ │ │ └── actor_critic.py
│ │ ├── data.py
│ │ ├── house_prices/
│ │ │ ├── README.md
│ │ │ └── kaggle_k_fold_cross_validation.py
│ │ ├── image_classification.py
│ │ ├── mnist/
│ │ │ ├── README.md
│ │ │ └── mnist.py
│ │ └── super_resolution/
│ │ ├── README.md
│ │ └── super_resolution.py
│ ├── multi-task/
│ │ ├── README.md
│ │ └── multi-task-learning.ipynb
│ ├── probability/
│ │ └── VAE/
│ │ └── VAE.md
│ ├── profiler/
│ │ ├── README.md
│ │ ├── profiler_imageiter.py
│ │ ├── profiler_matmul.py
│ │ └── profiler_ndarray.py
│ ├── quantization/
│ │ ├── README.md
│ │ ├── imagenet_gen_qsym_onednn.py
│ │ ├── imagenet_inference.py
│ │ └── launch_inference_onednn.sh
│ ├── quantization_inc/
│ │ ├── custom_strategy.py
│ │ ├── resnet50v2_mse.yaml
│ │ ├── resnet_measurement.py
│ │ ├── resnet_mse.py
│ │ └── resnet_tuning.py
│ └── recommenders/
│ ├── .gitignore
│ ├── README.md
│ ├── demo1-MF.ipynb
│ ├── demo2-dssm.ipynb
│ ├── matrix_fact.py
│ └── movielens_data.py
├── include/
│ └── mxnet/
│ ├── api_registry.h
│ ├── base.h
│ ├── c_api.h
│ ├── c_api_error.h
│ ├── c_api_test.h
│ ├── engine.h
│ ├── executor.h
│ ├── expr_operator.h
│ ├── graph_attr_types.h
│ ├── imperative.h
│ ├── io.h
│ ├── ir/
│ │ └── expr.h
│ ├── kvstore.h
│ ├── lib_api.h
│ ├── libinfo.h
│ ├── ndarray.h
│ ├── node/
│ │ ├── container.h
│ │ └── node.h
│ ├── op_attr_types.h
│ ├── operator.h
│ ├── operator_util.h
│ ├── random_generator.h
│ ├── resource.h
│ ├── rtc.h
│ ├── runtime/
│ │ ├── c_runtime_api.h
│ │ ├── container.h
│ │ ├── container_ext.h
│ │ ├── data_type.h
│ │ ├── ffi_helper.h
│ │ ├── memory.h
│ │ ├── ndarray.h
│ │ ├── ndarray_handle.h
│ │ ├── object.h
│ │ ├── packed_func.h
│ │ ├── py_arg.h
│ │ └── registry.h
│ ├── storage.h
│ ├── tensor_blob.h
│ └── tuple.h
├── licenses/
│ ├── BOOST1_0
│ ├── BSD2
│ ├── BSD3-cmake
│ ├── MIT
│ └── OFL1_1
├── plugin/
│ ├── opencv/
│ │ ├── __init__.py
│ │ ├── cv_api.cc
│ │ ├── cv_api.h
│ │ ├── opencv.mk
│ │ └── opencv.py
│ ├── sframe/
│ │ ├── iter_sframe.cc
│ │ └── plugin.mk
│ ├── torch/
│ │ ├── torch.mk
│ │ ├── torch_base.cc
│ │ ├── torch_base.h
│ │ ├── torch_criterion-inl.h
│ │ ├── torch_criterion.cc
│ │ ├── torch_criterion.cu
│ │ ├── torch_function.cc
│ │ ├── torch_function.h
│ │ ├── torch_module-inl.h
│ │ ├── torch_module.cc
│ │ └── torch_module.cu
│ └── warpctc/
│ ├── warpctc-inl.h
│ ├── warpctc.cc
│ ├── warpctc.cu
│ └── warpctc.mk
├── prospector.yaml
├── pytest.ini
├── python/
│ ├── .gitignore
│ ├── README.md
│ ├── mxnet/
│ │ ├── __init__.py
│ │ ├── _api_internal.py
│ │ ├── _ctypes/
│ │ │ ├── __init__.py
│ │ │ ├── _api_internal.py
│ │ │ ├── cached_op.py
│ │ │ ├── ndarray.py
│ │ │ ├── space.py
│ │ │ └── symbol.py
│ │ ├── _cy3/
│ │ │ ├── README.md
│ │ │ └── __init__.py
│ │ ├── _deferred_compute.py
│ │ ├── _ffi/
│ │ │ ├── __init__.py
│ │ │ ├── _ctypes/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── function.py
│ │ │ │ ├── object.py
│ │ │ │ └── types.py
│ │ │ ├── _cy3/
│ │ │ │ └── __init__.py
│ │ │ ├── _cython/
│ │ │ │ ├── base.pxi
│ │ │ │ ├── core.pyx
│ │ │ │ ├── function.pxi
│ │ │ │ ├── ndarray.pxi
│ │ │ │ └── object.pxi
│ │ │ ├── base.py
│ │ │ ├── function.py
│ │ │ ├── node_generic.py
│ │ │ ├── object.py
│ │ │ └── runtime_ctypes.py
│ │ ├── _global_var.py
│ │ ├── _numpy_op_doc.py
│ │ ├── amp/
│ │ │ ├── __init__.py
│ │ │ ├── amp.py
│ │ │ ├── lists/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── symbol_bf16.py
│ │ │ │ └── symbol_fp16.py
│ │ │ └── loss_scaler.py
│ │ ├── api.py
│ │ ├── attribute.py
│ │ ├── autograd.py
│ │ ├── base.py
│ │ ├── callback.py
│ │ ├── container.py
│ │ ├── context.py
│ │ ├── contrib/
│ │ │ ├── __init__.py
│ │ │ ├── io.py
│ │ │ ├── ndarray.py
│ │ │ ├── onnx/
│ │ │ │ └── __init__.py
│ │ │ ├── quantization.py
│ │ │ ├── symbol.py
│ │ │ ├── tensorboard.py
│ │ │ ├── tensorrt.py
│ │ │ └── text/
│ │ │ ├── __init__.py
│ │ │ ├── _constants.py
│ │ │ ├── embedding.py
│ │ │ ├── utils.py
│ │ │ └── vocab.py
│ │ ├── cuda/
│ │ │ ├── __init__.py
│ │ │ └── nvtx.py
│ │ ├── cython/
│ │ │ ├── __init__.py
│ │ │ ├── base.pyi
│ │ │ ├── ndarray.pyx
│ │ │ └── symbol.pyx
│ │ ├── device.py
│ │ ├── dlpack.py
│ │ ├── engine.py
│ │ ├── error.py
│ │ ├── executor.py
│ │ ├── gluon/
│ │ │ ├── .gitignore
│ │ │ ├── __init__.py
│ │ │ ├── block.py
│ │ │ ├── contrib/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── data/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── _constants.py
│ │ │ │ │ └── vision/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── dataloader.py
│ │ │ │ │ └── transforms/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── bbox/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── bbox.py
│ │ │ │ │ └── utils.py
│ │ │ │ └── estimator/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── batch_processor.py
│ │ │ │ ├── estimator.py
│ │ │ │ ├── event_handler.py
│ │ │ │ └── utils.py
│ │ │ ├── data/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── _internal.py
│ │ │ │ ├── batchify.py
│ │ │ │ ├── dataloader.py
│ │ │ │ ├── dataset.py
│ │ │ │ ├── sampler.py
│ │ │ │ └── vision/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── datasets.py
│ │ │ │ └── transforms/
│ │ │ │ ├── __init__.py
│ │ │ │ └── image.py
│ │ │ ├── loss.py
│ │ │ ├── metric.py
│ │ │ ├── model_zoo/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── model_store.py
│ │ │ │ └── vision/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── alexnet.py
│ │ │ │ ├── densenet.py
│ │ │ │ ├── inception.py
│ │ │ │ ├── mobilenet.py
│ │ │ │ ├── resnet.py
│ │ │ │ ├── squeezenet.py
│ │ │ │ └── vgg.py
│ │ │ ├── nn/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── activations.py
│ │ │ │ ├── basic_layers.py
│ │ │ │ └── conv_layers.py
│ │ │ ├── parameter.py
│ │ │ ├── probability/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── block/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── stochastic_block.py
│ │ │ │ ├── distributions/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── bernoulli.py
│ │ │ │ │ ├── beta.py
│ │ │ │ │ ├── binomial.py
│ │ │ │ │ ├── categorical.py
│ │ │ │ │ ├── cauchy.py
│ │ │ │ │ ├── chi2.py
│ │ │ │ │ ├── constraint.py
│ │ │ │ │ ├── dirichlet.py
│ │ │ │ │ ├── distribution.py
│ │ │ │ │ ├── divergence.py
│ │ │ │ │ ├── exp_family.py
│ │ │ │ │ ├── exponential.py
│ │ │ │ │ ├── fishersnedecor.py
│ │ │ │ │ ├── gamma.py
│ │ │ │ │ ├── geometric.py
│ │ │ │ │ ├── gumbel.py
│ │ │ │ │ ├── half_cauchy.py
│ │ │ │ │ ├── half_normal.py
│ │ │ │ │ ├── independent.py
│ │ │ │ │ ├── laplace.py
│ │ │ │ │ ├── multinomial.py
│ │ │ │ │ ├── multivariate_normal.py
│ │ │ │ │ ├── negative_binomial.py
│ │ │ │ │ ├── normal.py
│ │ │ │ │ ├── one_hot_categorical.py
│ │ │ │ │ ├── pareto.py
│ │ │ │ │ ├── poisson.py
│ │ │ │ │ ├── relaxed_bernoulli.py
│ │ │ │ │ ├── relaxed_one_hot_categorical.py
│ │ │ │ │ ├── studentT.py
│ │ │ │ │ ├── transformed_distribution.py
│ │ │ │ │ ├── uniform.py
│ │ │ │ │ ├── utils.py
│ │ │ │ │ └── weibull.py
│ │ │ │ └── transformation/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── domain_map.py
│ │ │ │ └── transformation.py
│ │ │ ├── rnn/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── conv_rnn_cell.py
│ │ │ │ ├── rnn_cell.py
│ │ │ │ └── rnn_layer.py
│ │ │ ├── trainer.py
│ │ │ └── utils.py
│ │ ├── image/
│ │ │ ├── __init__.py
│ │ │ ├── detection.py
│ │ │ └── image.py
│ │ ├── initializer.py
│ │ ├── io/
│ │ │ ├── __init__.py
│ │ │ ├── io.py
│ │ │ └── utils.py
│ │ ├── kvstore/
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── byteps.py
│ │ │ ├── horovod.py
│ │ │ ├── kvstore.py
│ │ │ └── kvstore_server.py
│ │ ├── libinfo.py
│ │ ├── library.py
│ │ ├── log.py
│ │ ├── lr_scheduler.py
│ │ ├── misc.py
│ │ ├── model.py
│ │ ├── name.py
│ │ ├── ndarray/
│ │ │ ├── __init__.py
│ │ │ ├── _internal.py
│ │ │ ├── contrib.py
│ │ │ ├── image.py
│ │ │ ├── linalg.py
│ │ │ ├── ndarray.py
│ │ │ ├── numpy/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── _api_internal.py
│ │ │ │ ├── _internal.py
│ │ │ │ ├── _op.py
│ │ │ │ ├── _register.py
│ │ │ │ ├── linalg.py
│ │ │ │ └── random.py
│ │ │ ├── numpy_extension/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── _api_internal.py
│ │ │ │ ├── _op.py
│ │ │ │ ├── _register.py
│ │ │ │ ├── control_flow.py
│ │ │ │ ├── image.py
│ │ │ │ └── random.py
│ │ │ ├── op.py
│ │ │ ├── random.py
│ │ │ ├── register.py
│ │ │ ├── sparse.py
│ │ │ └── utils.py
│ │ ├── ndarray_doc.py
│ │ ├── notebook/
│ │ │ ├── __init__.py
│ │ │ └── callback.py
│ │ ├── numpy/
│ │ │ ├── __init__.py
│ │ │ ├── _op.py
│ │ │ ├── _register.py
│ │ │ ├── arrayprint.py
│ │ │ ├── fallback.py
│ │ │ ├── fallback_linalg.py
│ │ │ ├── function_base.py
│ │ │ ├── io.py
│ │ │ ├── linalg.py
│ │ │ ├── multiarray.py
│ │ │ ├── random.py
│ │ │ ├── set_functions.py
│ │ │ ├── stride_tricks.py
│ │ │ ├── type_functions.py
│ │ │ └── utils.py
│ │ ├── numpy_dispatch_protocol.py
│ │ ├── numpy_extension/
│ │ │ ├── __init__.py
│ │ │ ├── _op.py
│ │ │ ├── _register.py
│ │ │ ├── control_flow.py
│ │ │ ├── image.py
│ │ │ ├── random.py
│ │ │ └── utils.py
│ │ ├── numpy_op_fallback.py
│ │ ├── numpy_op_signature.py
│ │ ├── onnx/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── mx2onnx/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── _export_helper.py
│ │ │ │ ├── _export_model.py
│ │ │ │ ├── _export_onnx.py
│ │ │ │ └── _op_translations/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── _op_translations_opset12.py
│ │ │ │ └── _op_translations_opset13.py
│ │ │ └── setup.py
│ │ ├── operator.py
│ │ ├── optimizer/
│ │ │ ├── __init__.py
│ │ │ ├── adabelief.py
│ │ │ ├── adadelta.py
│ │ │ ├── adagrad.py
│ │ │ ├── adam.py
│ │ │ ├── adamW.py
│ │ │ ├── adamax.py
│ │ │ ├── contrib.py
│ │ │ ├── dcasgd.py
│ │ │ ├── ftml.py
│ │ │ ├── ftrl.py
│ │ │ ├── lamb.py
│ │ │ ├── lans.py
│ │ │ ├── lars.py
│ │ │ ├── nadam.py
│ │ │ ├── nag.py
│ │ │ ├── optimizer.py
│ │ │ ├── rmsprop.py
│ │ │ ├── sgd.py
│ │ │ ├── sgld.py
│ │ │ ├── signum.py
│ │ │ ├── updater.py
│ │ │ └── utils.py
│ │ ├── profiler.py
│ │ ├── random.py
│ │ ├── recordio.py
│ │ ├── registry.py
│ │ ├── rtc.py
│ │ ├── runtime.py
│ │ ├── symbol/
│ │ │ ├── __init__.py
│ │ │ ├── _internal.py
│ │ │ ├── contrib.py
│ │ │ ├── image.py
│ │ │ ├── linalg.py
│ │ │ ├── numpy/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── _internal.py
│ │ │ │ ├── _op.py
│ │ │ │ ├── _register.py
│ │ │ │ ├── _symbol.py
│ │ │ │ ├── linalg.py
│ │ │ │ └── random.py
│ │ │ ├── numpy_extension/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── _op.py
│ │ │ │ ├── _register.py
│ │ │ │ ├── image.py
│ │ │ │ └── random.py
│ │ │ ├── op.py
│ │ │ ├── random.py
│ │ │ ├── register.py
│ │ │ ├── sparse.py
│ │ │ └── symbol.py
│ │ ├── symbol_doc.py
│ │ ├── test_utils.py
│ │ ├── tvmop.py
│ │ ├── util.py
│ │ └── visualization.py
│ └── setup.py
├── rat-excludes
├── readthedocs.yml
├── snap.python
├── src/
│ ├── api/
│ │ ├── _api_internal/
│ │ │ └── _api_internal.cc
│ │ ├── cached_op_api.cc
│ │ └── operator/
│ │ ├── numpy/
│ │ │ ├── linalg/
│ │ │ │ ├── np_det.cc
│ │ │ │ ├── np_eig.cc
│ │ │ │ ├── np_eigvals.cc
│ │ │ │ ├── np_gesvd.cc
│ │ │ │ ├── np_inv.cc
│ │ │ │ ├── np_lstsq.cc
│ │ │ │ ├── np_matrix_rank.cc
│ │ │ │ ├── np_norm.cc
│ │ │ │ ├── np_pinv.cc
│ │ │ │ ├── np_potrf.cc
│ │ │ │ ├── np_qr.cc
│ │ │ │ ├── np_slogdet.cc
│ │ │ │ ├── np_solve.cc
│ │ │ │ ├── np_tensorinv.cc
│ │ │ │ └── np_tensorsolve.cc
│ │ │ ├── np_bincount_op.cc
│ │ │ ├── np_broadcast_reduce_op_boolean.cc
│ │ │ ├── np_broadcast_reduce_op_index.cc
│ │ │ ├── np_broadcast_reduce_op_value.cc
│ │ │ ├── np_cross.cc
│ │ │ ├── np_cumsum.cc
│ │ │ ├── np_delete_op.cc
│ │ │ ├── np_diff_op.cc
│ │ │ ├── np_dot_op.cc
│ │ │ ├── np_ediff1d_op.cc
│ │ │ ├── np_einsum_op.cc
│ │ │ ├── np_elemwise_broadcast_logic_op.cc
│ │ │ ├── np_elemwise_broadcast_op.cc
│ │ │ ├── np_elemwise_broadcast_op_extended_sec.cc
│ │ │ ├── np_elemwise_unary_op_basic.cc
│ │ │ ├── np_fill_diagonal_op.cc
│ │ │ ├── np_histogram_op.cc
│ │ │ ├── np_init_op.cc
│ │ │ ├── np_insert_op.cc
│ │ │ ├── np_interp_op.cc
│ │ │ ├── np_kron.cc
│ │ │ ├── np_matmul_op.cc
│ │ │ ├── np_matrix_op.cc
│ │ │ ├── np_memory_op.cc
│ │ │ ├── np_moments_op.cc
│ │ │ ├── np_nan_to_num_op.cc
│ │ │ ├── np_nonzero_op.cc
│ │ │ ├── np_ordering_op.cc
│ │ │ ├── np_pad_op.cc
│ │ │ ├── np_percentile_op.cc
│ │ │ ├── np_polynomial_op.cc
│ │ │ ├── np_repeat_op.cc
│ │ │ ├── np_tensordot_op.cc
│ │ │ ├── np_trace_op.cc
│ │ │ ├── np_tri_op.cc
│ │ │ ├── np_tril_op.cc
│ │ │ ├── np_triu_op.cc
│ │ │ ├── np_unique_op.cc
│ │ │ ├── np_where_op.cc
│ │ │ ├── np_window_op.cc
│ │ │ └── random/
│ │ │ ├── np_choice_op.cc
│ │ │ ├── np_exponential_op.cc
│ │ │ ├── np_laplace_op.cc
│ │ │ ├── np_location_scale_op.cc
│ │ │ ├── np_multinomial_op.cc
│ │ │ ├── np_pareto_op.cc
│ │ │ ├── np_power_op.cc
│ │ │ ├── np_rayleigh_op.cc
│ │ │ └── np_weibull_op.cc
│ │ ├── numpy_extension/
│ │ │ ├── npx_activation_op.cc
│ │ │ ├── npx_arange_like_op.cc
│ │ │ ├── npx_batch_dot_op.cc
│ │ │ ├── npx_batch_norm_op.cc
│ │ │ ├── npx_broadcast_like_op.cc
│ │ │ ├── npx_control_flow_op.cc
│ │ │ ├── npx_convolution_op.cc
│ │ │ ├── npx_deconvolution_op.cc
│ │ │ ├── npx_dropout_op.cc
│ │ │ ├── npx_embedding_op.cc
│ │ │ ├── npx_fully_connected_op.cc
│ │ │ ├── npx_group_norm_op.cc
│ │ │ ├── npx_layer_norm_op.cc
│ │ │ ├── npx_leaky_relu_op.cc
│ │ │ ├── npx_one_hot_op.cc
│ │ │ ├── npx_pick_op.cc
│ │ │ ├── npx_pooling_op.cc
│ │ │ ├── npx_rnn_op.cc
│ │ │ ├── npx_softmax_op.cc
│ │ │ └── npx_topk_op.cc
│ │ ├── op_utils.cc
│ │ ├── op_utils.h
│ │ ├── random/
│ │ │ ├── np_gamma_op.cc
│ │ │ ├── np_normal_op.cc
│ │ │ ├── np_randint_op.cc
│ │ │ ├── np_uniform_op.cc
│ │ │ └── shuffle_op.cc
│ │ ├── tensor/
│ │ │ ├── elemwise_binary_broadcast_op_extended.cc
│ │ │ ├── indexing_op.cc
│ │ │ ├── matrix_op.cc
│ │ │ └── unravel.cc
│ │ ├── ufunc_helper.cc
│ │ ├── ufunc_helper.h
│ │ ├── utils.cc
│ │ └── utils.h
│ ├── base.cc
│ ├── c_api/
│ │ ├── .clang-tidy
│ │ ├── c_api.cc
│ │ ├── c_api_common.h
│ │ ├── c_api_function.cc
│ │ ├── c_api_ndarray.cc
│ │ ├── c_api_profile.cc
│ │ ├── c_api_symbolic.cc
│ │ └── c_api_test.cc
│ ├── common/
│ │ ├── alm.cc
│ │ ├── alm.h
│ │ ├── cuda/
│ │ │ ├── cudnn_cxx.cc
│ │ │ ├── cudnn_cxx.h
│ │ │ ├── nvtx.h
│ │ │ ├── rtc/
│ │ │ │ ├── backward_functions-inl.h
│ │ │ │ ├── forward_functions-inl.h
│ │ │ │ ├── half-inl.h
│ │ │ │ ├── reducer-inl.h
│ │ │ │ ├── special_functions-inl.h
│ │ │ │ ├── util-inl.h
│ │ │ │ └── vectorization-inl.h
│ │ │ ├── rtc.cc
│ │ │ ├── rtc.h
│ │ │ ├── utils.cc
│ │ │ └── utils.h
│ │ ├── exec_utils.cc
│ │ ├── exec_utils.h
│ │ ├── lazy_alloc_array.h
│ │ ├── object_pool.h
│ │ ├── random_generator.cu
│ │ ├── rtc.cc
│ │ ├── static_array.h
│ │ ├── tensor_inspector.h
│ │ ├── utils.cc
│ │ ├── utils.cu
│ │ └── utils.h
│ ├── engine/
│ │ ├── engine.cc
│ │ ├── engine_impl.h
│ │ ├── naive_engine.cc
│ │ ├── openmp.cc
│ │ ├── openmp.h
│ │ ├── stream_manager.h
│ │ ├── thread_pool.h
│ │ ├── threaded_engine.cc
│ │ ├── threaded_engine.h
│ │ ├── threaded_engine_perdevice.cc
│ │ └── threaded_engine_pooled.cc
│ ├── imperative/
│ │ ├── attach_op_execs_pass.cc
│ │ ├── attach_op_resource_pass.cc
│ │ ├── cached_op.cc
│ │ ├── cached_op.h
│ │ ├── cached_op_threadsafe.cc
│ │ ├── cached_op_threadsafe.h
│ │ ├── cuda_graphs.h
│ │ ├── eliminate_common_expr_pass.cc
│ │ ├── exec_pass.h
│ │ ├── imperative.cc
│ │ ├── imperative_utils.cc
│ │ ├── imperative_utils.h
│ │ ├── infer_graph_attr_pass.cc
│ │ ├── inplace_addto_detect_pass.cc
│ │ ├── naive_cached_op.cc
│ │ ├── naive_cached_op.h
│ │ ├── pointwise_fusion_pass.cc
│ │ ├── simple_partition_pass.cc
│ │ └── simple_partition_pass.h
│ ├── initialize.cc
│ ├── initialize.h
│ ├── io/
│ │ ├── batchify.cc
│ │ ├── dataloader.cc
│ │ ├── dataset.cc
│ │ ├── image_aug_default.cc
│ │ ├── image_augmenter.h
│ │ ├── image_det_aug_default.cc
│ │ ├── image_io.cc
│ │ ├── image_iter_common.h
│ │ ├── image_recordio.h
│ │ ├── inst_vector.h
│ │ ├── io.cc
│ │ ├── iter_batchloader.h
│ │ ├── iter_csv.cc
│ │ ├── iter_image_det_recordio.cc
│ │ ├── iter_image_recordio.cc
│ │ ├── iter_image_recordio_2.cc
│ │ ├── iter_libsvm.cc
│ │ ├── iter_mnist.cc
│ │ ├── iter_normalize.h
│ │ ├── iter_prefetcher.h
│ │ ├── iter_sampler.cc
│ │ ├── iter_sparse.h
│ │ ├── iter_sparse_batchloader.h
│ │ ├── iter_sparse_prefetcher.h
│ │ └── opencv_compatibility.h
│ ├── ir/
│ │ └── expr.cc
│ ├── kvstore/
│ │ ├── comm.h
│ │ ├── comm_tree.h
│ │ ├── gpu_topology.h
│ │ ├── gradient_compression-inl.h
│ │ ├── gradient_compression.cc
│ │ ├── gradient_compression.cu
│ │ ├── gradient_compression.h
│ │ ├── kvstore.cc
│ │ ├── kvstore_dist.h
│ │ ├── kvstore_dist_server.h
│ │ ├── kvstore_local.h
│ │ ├── kvstore_nccl.h
│ │ ├── kvstore_utils.cc
│ │ ├── kvstore_utils.cu
│ │ ├── kvstore_utils.h
│ │ └── p3store_dist.h
│ ├── lang/
│ │ ├── expr.cc
│ │ └── ir.cc
│ ├── lib_api.cc
│ ├── libinfo.cc
│ ├── ndarray/
│ │ ├── ndarray.cc
│ │ ├── ndarray_function-inl.cuh
│ │ ├── ndarray_function-inl.h
│ │ ├── ndarray_function.cc
│ │ ├── ndarray_function.cu
│ │ └── ndarray_function.h
│ ├── nnvm/
│ │ ├── error.h
│ │ ├── gradient.cc
│ │ ├── graph_algorithm.h
│ │ ├── graph_editor.cc
│ │ ├── legacy_json_util.cc
│ │ ├── legacy_op_util.cc
│ │ ├── low_precision_pass.cc
│ │ ├── node_op_util.h
│ │ ├── plan_memory.cc
│ │ └── tvm_bridge.cc
│ ├── operator/
│ │ ├── all_finite-inl.h
│ │ ├── all_finite.cc
│ │ ├── all_finite.cu
│ │ ├── amp_graph_pass.cc
│ │ ├── bilinear_sampler-inl.h
│ │ ├── bilinear_sampler.cc
│ │ ├── bilinear_sampler.cu
│ │ ├── c_lapack_api.cc
│ │ ├── c_lapack_api.h
│ │ ├── channel_op_common.h
│ │ ├── contrib/
│ │ │ ├── adabelief-inl.h
│ │ │ ├── adabelief.cc
│ │ │ ├── adabelief.cu
│ │ │ ├── adamw-inl.h
│ │ │ ├── adamw.cc
│ │ │ ├── adamw.cu
│ │ │ ├── adaptive_avg_pooling-inl.h
│ │ │ ├── adaptive_avg_pooling.cc
│ │ │ ├── adaptive_avg_pooling.cu
│ │ │ ├── allclose_op-inl.h
│ │ │ ├── allclose_op.cc
│ │ │ ├── allclose_op.cu
│ │ │ ├── bilinear_resize-inl.cuh
│ │ │ ├── bilinear_resize-inl.h
│ │ │ ├── bilinear_resize.cc
│ │ │ ├── bilinear_resize.cu
│ │ │ ├── boolean_mask-inl.h
│ │ │ ├── boolean_mask.cc
│ │ │ ├── boolean_mask.cu
│ │ │ ├── bounding_box-common.h
│ │ │ ├── bounding_box-inl.cuh
│ │ │ ├── bounding_box-inl.h
│ │ │ ├── bounding_box.cc
│ │ │ ├── bounding_box.cu
│ │ │ ├── count_sketch-inl.h
│ │ │ ├── count_sketch.cc
│ │ │ ├── count_sketch.cu
│ │ │ ├── deformable_psroi_pooling-inl.h
│ │ │ ├── deformable_psroi_pooling.cc
│ │ │ ├── deformable_psroi_pooling.cu
│ │ │ ├── dgl_graph-inl.h
│ │ │ ├── dgl_graph.cc
│ │ │ ├── dgl_graph.cu
│ │ │ ├── dynamic_shape_ops-inl.h
│ │ │ ├── dynamic_shape_ops.cc
│ │ │ ├── erfinv-inl.h
│ │ │ ├── fft-inl.h
│ │ │ ├── fft.cc
│ │ │ ├── fft.cu
│ │ │ ├── gradient_multiplier_op.cc
│ │ │ ├── gradient_multiplier_op.cu
│ │ │ ├── hawkes_ll-inl.h
│ │ │ ├── hawkes_ll.cc
│ │ │ ├── hawkes_ll.cu
│ │ │ ├── index_array-inl.h
│ │ │ ├── index_array.cc
│ │ │ ├── index_array.cu
│ │ │ ├── index_copy-inl.h
│ │ │ ├── index_copy.cc
│ │ │ ├── index_copy.cu
│ │ │ ├── intgemm/
│ │ │ │ ├── intgemm_fully_connected_op.cc
│ │ │ │ ├── max_absolute_op.cc
│ │ │ │ ├── prepare_data_op.cc
│ │ │ │ ├── prepare_weight_op.cc
│ │ │ │ └── take_weight_op.cc
│ │ │ ├── krprod.cc
│ │ │ ├── krprod.h
│ │ │ ├── mrcnn_mask_target-inl.h
│ │ │ ├── mrcnn_mask_target.cu
│ │ │ ├── multi_lamb-inl.h
│ │ │ ├── multi_lamb.cc
│ │ │ ├── multi_lamb.cu
│ │ │ ├── multi_lans-inl.h
│ │ │ ├── multi_lans.cc
│ │ │ ├── multi_lans.cu
│ │ │ ├── multi_lars-inl.h
│ │ │ ├── multi_lars.cc
│ │ │ ├── multi_lars.cu
│ │ │ ├── multi_proposal-inl.h
│ │ │ ├── multi_proposal.cc
│ │ │ ├── multi_proposal.cu
│ │ │ ├── multi_sum_sq-inl.h
│ │ │ ├── multi_sum_sq.cc
│ │ │ ├── multi_sum_sq.cu
│ │ │ ├── multibox_detection-inl.h
│ │ │ ├── multibox_detection.cc
│ │ │ ├── multibox_detection.cu
│ │ │ ├── multibox_prior-inl.h
│ │ │ ├── multibox_prior.cc
│ │ │ ├── multibox_prior.cu
│ │ │ ├── multibox_target-inl.h
│ │ │ ├── multibox_target.cc
│ │ │ ├── multibox_target.cu
│ │ │ ├── nn/
│ │ │ │ ├── deformable_im2col.cuh
│ │ │ │ ├── deformable_im2col.h
│ │ │ │ ├── modulated_deformable_im2col.cuh
│ │ │ │ └── modulated_deformable_im2col.h
│ │ │ ├── nnz.cc
│ │ │ ├── optimizer_op-inl.h
│ │ │ ├── optimizer_op.cc
│ │ │ ├── optimizer_op.cu
│ │ │ ├── preloaded_multi_sgd-inl.h
│ │ │ ├── preloaded_multi_sgd.cc
│ │ │ ├── preloaded_multi_sgd.cu
│ │ │ ├── proposal-inl.h
│ │ │ ├── proposal.cc
│ │ │ ├── proposal.cu
│ │ │ ├── psroi_pooling-inl.h
│ │ │ ├── psroi_pooling.cc
│ │ │ ├── psroi_pooling.cu
│ │ │ ├── quadratic_op-inl.h
│ │ │ ├── quadratic_op.cc
│ │ │ ├── quadratic_op.cu
│ │ │ ├── reset_arrays-inl.h
│ │ │ ├── reset_arrays.cc
│ │ │ ├── reset_arrays.cu
│ │ │ ├── roi_align-inl.h
│ │ │ ├── roi_align.cc
│ │ │ ├── roi_align.cu
│ │ │ ├── rroi_align-inl.h
│ │ │ ├── rroi_align.cc
│ │ │ ├── stes_op.cc
│ │ │ ├── stes_op.cu
│ │ │ ├── stes_op.h
│ │ │ ├── sync_batch_norm-inl.h
│ │ │ ├── sync_batch_norm.cc
│ │ │ ├── sync_batch_norm.cu
│ │ │ ├── transformer-inl.h
│ │ │ ├── transformer.cc
│ │ │ ├── transformer.cu
│ │ │ └── tvmop/
│ │ │ ├── dot.cc
│ │ │ └── ufunc.cc
│ │ ├── control_flow.cc
│ │ ├── correlation-inl.h
│ │ ├── correlation.cc
│ │ ├── correlation.cu
│ │ ├── crop-inl.h
│ │ ├── crop.cc
│ │ ├── crop.cu
│ │ ├── cross_device_copy.cc
│ │ ├── cudnn_bilinear_sampler-inl.h
│ │ ├── cudnn_lrn-inl.h
│ │ ├── cudnn_ops.cc
│ │ ├── cudnn_ops.h
│ │ ├── cudnn_spatial_transformer-inl.h
│ │ ├── custom/
│ │ │ ├── custom-inl.h
│ │ │ ├── custom.cc
│ │ │ ├── native_op-inl.h
│ │ │ ├── native_op.cc
│ │ │ ├── native_op.cu
│ │ │ ├── ndarray_op-inl.h
│ │ │ └── ndarray_op.cc
│ │ ├── deformable_convolution-inl.h
│ │ ├── deformable_convolution.cc
│ │ ├── deformable_convolution.cu
│ │ ├── elemwise_op_common.h
│ │ ├── fusion/
│ │ │ ├── fused_op-inl.h
│ │ │ ├── fused_op.cc
│ │ │ ├── fused_op.cu
│ │ │ └── fused_op.h
│ │ ├── grid_generator-inl.h
│ │ ├── grid_generator.cc
│ │ ├── grid_generator.cu
│ │ ├── identity_attach_KL_sparse_reg-inl.h
│ │ ├── identity_attach_KL_sparse_reg.cc
│ │ ├── identity_attach_KL_sparse_reg.cu
│ │ ├── image/
│ │ │ ├── crop-inl.h
│ │ │ ├── crop.cc
│ │ │ ├── crop.cu
│ │ │ ├── image_random-inl.h
│ │ │ ├── image_random.cc
│ │ │ ├── image_random.cu
│ │ │ ├── image_utils.h
│ │ │ ├── resize-inl.h
│ │ │ ├── resize.cc
│ │ │ └── resize.cu
│ │ ├── instance_norm-inl.h
│ │ ├── instance_norm.cc
│ │ ├── instance_norm.cu
│ │ ├── l2_normalization-inl.h
│ │ ├── l2_normalization.cc
│ │ ├── l2_normalization.cu
│ │ ├── leaky_relu-inl.h
│ │ ├── leaky_relu.cc
│ │ ├── leaky_relu.cu
│ │ ├── linalg.h
│ │ ├── linalg_impl.h
│ │ ├── loss_binary_op-inl.h
│ │ ├── loss_binary_op.cc
│ │ ├── loss_binary_op.cu
│ │ ├── make_loss-inl.h
│ │ ├── make_loss.cc
│ │ ├── make_loss.cu
│ │ ├── math_functions-inl.h
│ │ ├── mkl_functions-inl.h
│ │ ├── modulated_deformable_convolution-inl.h
│ │ ├── modulated_deformable_convolution.cc
│ │ ├── modulated_deformable_convolution.cu
│ │ ├── mshadow_op.h
│ │ ├── mxnet_op.h
│ │ ├── nn/
│ │ │ ├── activation-inl.h
│ │ │ ├── activation.cc
│ │ │ ├── activation.cu
│ │ │ ├── batch_norm-inl.h
│ │ │ ├── batch_norm.cc
│ │ │ ├── batch_norm.cu
│ │ │ ├── concat-inl.h
│ │ │ ├── concat.cc
│ │ │ ├── concat.cu
│ │ │ ├── convolution-inl.h
│ │ │ ├── convolution.cc
│ │ │ ├── convolution.cu
│ │ │ ├── ctc_loss-inl.h
│ │ │ ├── ctc_loss.cc
│ │ │ ├── ctc_loss.cu
│ │ │ ├── cudnn/
│ │ │ │ ├── cudnn_activation-inl.h
│ │ │ │ ├── cudnn_algoreg-inl.h
│ │ │ │ ├── cudnn_algoreg.cc
│ │ │ │ ├── cudnn_batch_norm.cu
│ │ │ │ ├── cudnn_batch_norm.h
│ │ │ │ ├── cudnn_convolution-inl.h
│ │ │ │ ├── cudnn_deconvolution-inl.h
│ │ │ │ ├── cudnn_pooling-inl.h
│ │ │ │ └── cudnn_softmax_activation-inl.h
│ │ │ ├── deconvolution-inl.h
│ │ │ ├── deconvolution.cc
│ │ │ ├── deconvolution.cu
│ │ │ ├── depthwise_convolution-inl.h
│ │ │ ├── depthwise_convolution_tf.cuh
│ │ │ ├── dnnl/
│ │ │ │ ├── dnnl_act-inl.h
│ │ │ │ ├── dnnl_act.cc
│ │ │ │ ├── dnnl_base-inl.h
│ │ │ │ ├── dnnl_base.cc
│ │ │ │ ├── dnnl_batch_dot-inl.h
│ │ │ │ ├── dnnl_batch_dot.cc
│ │ │ │ ├── dnnl_batch_norm-inl.h
│ │ │ │ ├── dnnl_batch_norm.cc
│ │ │ │ ├── dnnl_binary-inl.h
│ │ │ │ ├── dnnl_binary.cc
│ │ │ │ ├── dnnl_concat-inl.h
│ │ │ │ ├── dnnl_concat.cc
│ │ │ │ ├── dnnl_convolution-inl.h
│ │ │ │ ├── dnnl_convolution.cc
│ │ │ │ ├── dnnl_copy-inl.h
│ │ │ │ ├── dnnl_copy.cc
│ │ │ │ ├── dnnl_deconvolution-inl.h
│ │ │ │ ├── dnnl_deconvolution.cc
│ │ │ │ ├── dnnl_dot-inl.h
│ │ │ │ ├── dnnl_dot.cc
│ │ │ │ ├── dnnl_eltwise-inl.h
│ │ │ │ ├── dnnl_eltwise.cc
│ │ │ │ ├── dnnl_fully_connected-inl.h
│ │ │ │ ├── dnnl_fully_connected.cc
│ │ │ │ ├── dnnl_layer_norm-inl.h
│ │ │ │ ├── dnnl_layer_norm.cc
│ │ │ │ ├── dnnl_log_softmax.cc
│ │ │ │ ├── dnnl_lrn-inl.h
│ │ │ │ ├── dnnl_masked_softmax-inl.h
│ │ │ │ ├── dnnl_masked_softmax.cc
│ │ │ │ ├── dnnl_pooling-inl.h
│ │ │ │ ├── dnnl_pooling.cc
│ │ │ │ ├── dnnl_pow_mul_scalar-inl.h
│ │ │ │ ├── dnnl_pow_mul_scalar.cc
│ │ │ │ ├── dnnl_reduce-inl.h
│ │ │ │ ├── dnnl_reduce.cc
│ │ │ │ ├── dnnl_reshape-inl.h
│ │ │ │ ├── dnnl_reshape.cc
│ │ │ │ ├── dnnl_rnn-inl.h
│ │ │ │ ├── dnnl_rnn.cc
│ │ │ │ ├── dnnl_softmax-inl.h
│ │ │ │ ├── dnnl_softmax.cc
│ │ │ │ ├── dnnl_softmax_output-inl.h
│ │ │ │ ├── dnnl_softmax_output.cc
│ │ │ │ ├── dnnl_split-inl.h
│ │ │ │ ├── dnnl_split.cc
│ │ │ │ ├── dnnl_stack-inl.h
│ │ │ │ ├── dnnl_stack.cc
│ │ │ │ ├── dnnl_sum-inl.h
│ │ │ │ ├── dnnl_sum.cc
│ │ │ │ ├── dnnl_transpose-inl.h
│ │ │ │ ├── dnnl_transpose.cc
│ │ │ │ ├── dnnl_where-inl.h
│ │ │ │ └── dnnl_where.cc
│ │ │ ├── dropout-inl.h
│ │ │ ├── dropout.cc
│ │ │ ├── dropout.cu
│ │ │ ├── fully_connected-inl.h
│ │ │ ├── fully_connected.cc
│ │ │ ├── fully_connected.cu
│ │ │ ├── group_norm-inl.h
│ │ │ ├── group_norm.cc
│ │ │ ├── group_norm.cu
│ │ │ ├── im2col-inl.h
│ │ │ ├── im2col.cc
│ │ │ ├── im2col.cu
│ │ │ ├── im2col.cuh
│ │ │ ├── im2col.h
│ │ │ ├── layer_norm-inl.h
│ │ │ ├── layer_norm.cc
│ │ │ ├── layer_norm.cu
│ │ │ ├── layer_norm_cpu.h
│ │ │ ├── log_softmax.cc
│ │ │ ├── log_softmax.cu
│ │ │ ├── lrn-inl.h
│ │ │ ├── lrn.cc
│ │ │ ├── lrn.cu
│ │ │ ├── masked_softmax.cc
│ │ │ ├── moments-inl.h
│ │ │ ├── moments.cc
│ │ │ ├── moments.cu
│ │ │ ├── pool.cuh
│ │ │ ├── pool.h
│ │ │ ├── pool_utils.h
│ │ │ ├── pooling-inl.h
│ │ │ ├── pooling.cc
│ │ │ ├── pooling.cu
│ │ │ ├── sequence_mask-inl.h
│ │ │ ├── softmax-inl.h
│ │ │ ├── softmax.cc
│ │ │ ├── softmax.cu
│ │ │ ├── softmax_activation-inl.h
│ │ │ ├── softmax_activation.cc
│ │ │ ├── softmax_activation.cu
│ │ │ ├── softmin.cc
│ │ │ ├── softmin.cu
│ │ │ ├── upsampling-inl.h
│ │ │ ├── upsampling.cc
│ │ │ └── upsampling.cu
│ │ ├── npx_control_flow.cc
│ │ ├── npx_control_flow.h
│ │ ├── numpy/
│ │ │ ├── linalg/
│ │ │ │ ├── broadcast_reduce_customized-inl.h
│ │ │ │ ├── broadcast_reduce_op_customized.h
│ │ │ │ ├── np_eig-inl.h
│ │ │ │ ├── np_eig.cc
│ │ │ │ ├── np_eig.cu
│ │ │ │ ├── np_eigvals-inl.h
│ │ │ │ ├── np_eigvals.cc
│ │ │ │ ├── np_eigvals.cu
│ │ │ │ ├── np_gesvd-inl.h
│ │ │ │ ├── np_gesvd.cc
│ │ │ │ ├── np_gesvd.cu
│ │ │ │ ├── np_lstsq-inl.h
│ │ │ │ ├── np_lstsq.cc
│ │ │ │ ├── np_lstsq.cu
│ │ │ │ ├── np_matrix_rank-inl.h
│ │ │ │ ├── np_matrix_rank.cc
│ │ │ │ ├── np_matrix_rank.cu
│ │ │ │ ├── np_norm-inl.h
│ │ │ │ ├── np_norm.cc
│ │ │ │ ├── np_norm_backward.cc
│ │ │ │ ├── np_norm_backward.cu
│ │ │ │ ├── np_norm_forward.cc
│ │ │ │ ├── np_norm_forward.cu
│ │ │ │ ├── np_pinv-inl.h
│ │ │ │ ├── np_pinv.cc
│ │ │ │ ├── np_pinv.cu
│ │ │ │ ├── np_potrf-inl.h
│ │ │ │ ├── np_potrf.cc
│ │ │ │ ├── np_potrf.cu
│ │ │ │ ├── np_qr-inl.h
│ │ │ │ ├── np_qr.cc
│ │ │ │ ├── np_qr.cu
│ │ │ │ ├── np_solve-inl.h
│ │ │ │ ├── np_solve.cc
│ │ │ │ ├── np_solve.cu
│ │ │ │ ├── np_tensorinv-inl.h
│ │ │ │ ├── np_tensorinv.cc
│ │ │ │ ├── np_tensorinv.cu
│ │ │ │ ├── np_tensorsolve-inl.h
│ │ │ │ ├── np_tensorsolve.cc
│ │ │ │ └── np_tensorsolve.cu
│ │ │ ├── np_bincount_op-inl.h
│ │ │ ├── np_bincount_op.cc
│ │ │ ├── np_bincount_op.cu
│ │ │ ├── np_boolean_mask_assign.cc
│ │ │ ├── np_boolean_mask_assign.cu
│ │ │ ├── np_broadcast_reduce_op.cc
│ │ │ ├── np_broadcast_reduce_op.h
│ │ │ ├── np_broadcast_reduce_op_boolean.cc
│ │ │ ├── np_broadcast_reduce_op_boolean.cu
│ │ │ ├── np_broadcast_reduce_op_index.cc
│ │ │ ├── np_broadcast_reduce_op_index.cu
│ │ │ ├── np_broadcast_reduce_op_value.h
│ │ │ ├── np_broadcast_reduce_op_value_broadcast_to.cc
│ │ │ ├── np_broadcast_reduce_op_value_broadcast_to.cu
│ │ │ ├── np_broadcast_reduce_op_value_max.cc
│ │ │ ├── np_broadcast_reduce_op_value_max.cu
│ │ │ ├── np_broadcast_reduce_op_value_mean.cc
│ │ │ ├── np_broadcast_reduce_op_value_mean.cu
│ │ │ ├── np_broadcast_reduce_op_value_min.cc
│ │ │ ├── np_broadcast_reduce_op_value_min.cu
│ │ │ ├── np_broadcast_reduce_op_value_prod.cc
│ │ │ ├── np_broadcast_reduce_op_value_prod.cu
│ │ │ ├── np_broadcast_reduce_op_value_sum.cc
│ │ │ ├── np_broadcast_reduce_op_value_sum.cu
│ │ │ ├── np_constraint_check.cc
│ │ │ ├── np_constraint_check.cu
│ │ │ ├── np_constraint_check.h
│ │ │ ├── np_cross-inl.h
│ │ │ ├── np_cross.cc
│ │ │ ├── np_cross.cu
│ │ │ ├── np_cumsum-inl.h
│ │ │ ├── np_cumsum.cc
│ │ │ ├── np_cumsum.cu
│ │ │ ├── np_delete_op-inl.h
│ │ │ ├── np_delete_op.cc
│ │ │ ├── np_delete_op.cu
│ │ │ ├── np_diff-inl.h
│ │ │ ├── np_diff.cc
│ │ │ ├── np_diff.cu
│ │ │ ├── np_dot-inl.h
│ │ │ ├── np_dot_backward.cc
│ │ │ ├── np_dot_backward.cu
│ │ │ ├── np_dot_forward.cc
│ │ │ ├── np_dot_forward.cu
│ │ │ ├── np_ediff1d_op-inl.h
│ │ │ ├── np_ediff1d_op.cc
│ │ │ ├── np_ediff1d_op.cu
│ │ │ ├── np_einsum_op-inl.h
│ │ │ ├── np_einsum_op.cc
│ │ │ ├── np_einsum_op.cu
│ │ │ ├── np_einsum_path_op-inl.h
│ │ │ ├── np_elemwise_broadcast_logic_op.h
│ │ │ ├── np_elemwise_broadcast_logic_op_and.cc
│ │ │ ├── np_elemwise_broadcast_logic_op_and.cu
│ │ │ ├── np_elemwise_broadcast_logic_op_equal.cc
│ │ │ ├── np_elemwise_broadcast_logic_op_equal.cu
│ │ │ ├── np_elemwise_broadcast_logic_op_greater.cc
│ │ │ ├── np_elemwise_broadcast_logic_op_greater.cu
│ │ │ ├── np_elemwise_broadcast_logic_op_greater_equal.cc
│ │ │ ├── np_elemwise_broadcast_logic_op_greater_equal.cu
│ │ │ ├── np_elemwise_broadcast_logic_op_less.cc
│ │ │ ├── np_elemwise_broadcast_logic_op_less.cu
│ │ │ ├── np_elemwise_broadcast_logic_op_less_equal.cc
│ │ │ ├── np_elemwise_broadcast_logic_op_less_equal.cu
│ │ │ ├── np_elemwise_broadcast_logic_op_not_equal.cc
│ │ │ ├── np_elemwise_broadcast_logic_op_not_equal.cu
│ │ │ ├── np_elemwise_broadcast_logic_op_or.cc
│ │ │ ├── np_elemwise_broadcast_logic_op_or.cu
│ │ │ ├── np_elemwise_broadcast_logic_op_xor.cc
│ │ │ ├── np_elemwise_broadcast_logic_op_xor.cu
│ │ │ ├── np_elemwise_broadcast_op.h
│ │ │ ├── np_elemwise_broadcast_op_add.cc
│ │ │ ├── np_elemwise_broadcast_op_add.cu
│ │ │ ├── np_elemwise_broadcast_op_extended.cc
│ │ │ ├── np_elemwise_broadcast_op_extended.cu
│ │ │ ├── np_elemwise_broadcast_op_extended_sec.cc
│ │ │ ├── np_elemwise_broadcast_op_extended_sec.cu
│ │ │ ├── np_elemwise_broadcast_op_extended_thi.cc
│ │ │ ├── np_elemwise_broadcast_op_extended_thi.cu
│ │ │ ├── np_elemwise_broadcast_op_lae.cc
│ │ │ ├── np_elemwise_broadcast_op_lae.cu
│ │ │ ├── np_elemwise_broadcast_op_mod.cc
│ │ │ ├── np_elemwise_broadcast_op_mod.cu
│ │ │ ├── np_elemwise_broadcast_op_mul.cc
│ │ │ ├── np_elemwise_broadcast_op_mul.cu
│ │ │ ├── np_elemwise_broadcast_op_pow.cc
│ │ │ ├── np_elemwise_broadcast_op_pow.cu
│ │ │ ├── np_elemwise_broadcast_op_scalar.cc
│ │ │ ├── np_elemwise_broadcast_op_scalar.cu
│ │ │ ├── np_elemwise_broadcast_op_sub.cc
│ │ │ ├── np_elemwise_broadcast_op_sub.cu
│ │ │ ├── np_elemwise_unary_op_basic.cc
│ │ │ ├── np_elemwise_unary_op_basic.cu
│ │ │ ├── np_fill_diagonal_op-inl.h
│ │ │ ├── np_fill_diagonal_op.cc
│ │ │ ├── np_fill_diagonal_op.cu
│ │ │ ├── np_floor_divide.cc
│ │ │ ├── np_floor_divide.cu
│ │ │ ├── np_indexing_op.cc
│ │ │ ├── np_indexing_op.cu
│ │ │ ├── np_indexing_op.h
│ │ │ ├── np_init_op.cc
│ │ │ ├── np_init_op.cu
│ │ │ ├── np_init_op.h
│ │ │ ├── np_insert_op-inl.h
│ │ │ ├── np_insert_op_scalar-inl.h
│ │ │ ├── np_insert_op_scalar.cc
│ │ │ ├── np_insert_op_scalar.cu
│ │ │ ├── np_insert_op_slice-inl.h
│ │ │ ├── np_insert_op_slice.cc
│ │ │ ├── np_insert_op_slice.cu
│ │ │ ├── np_insert_op_tensor-inl.h
│ │ │ ├── np_insert_op_tensor.cc
│ │ │ ├── np_insert_op_tensor.cu
│ │ │ ├── np_interp_op-inl.h
│ │ │ ├── np_interp_op.cc
│ │ │ ├── np_interp_op.cu
│ │ │ ├── np_kron-inl.h
│ │ │ ├── np_kron_backward.cc
│ │ │ ├── np_kron_backward.cu
│ │ │ ├── np_kron_forward.cc
│ │ │ ├── np_kron_forward.cu
│ │ │ ├── np_matmul_op-inl.h
│ │ │ ├── np_matmul_op.cc
│ │ │ ├── np_matmul_op.cu
│ │ │ ├── np_matrix_op-inl.h
│ │ │ ├── np_matrix_op.cc
│ │ │ ├── np_matrix_op.cu
│ │ │ ├── np_memory_op.cc
│ │ │ ├── np_memory_op.cu
│ │ │ ├── np_memory_op.h
│ │ │ ├── np_moments_op.cc
│ │ │ ├── np_moments_op.cu
│ │ │ ├── np_nonzero_op-inl.h
│ │ │ ├── np_nonzero_op.cc
│ │ │ ├── np_nonzero_op.cu
│ │ │ ├── np_pad_op-inl.h
│ │ │ ├── np_pad_op.cc
│ │ │ ├── np_pad_op.cu
│ │ │ ├── np_percentile_op-inl.h
│ │ │ ├── np_percentile_op.cc
│ │ │ ├── np_percentile_op.cu
│ │ │ ├── np_polynomial_op-inl.h
│ │ │ ├── np_polynomial_op.cc
│ │ │ ├── np_polynomial_op.cu
│ │ │ ├── np_repeat_op-inl.h
│ │ │ ├── np_repeat_op.cc
│ │ │ ├── np_repeat_op.cu
│ │ │ ├── np_tensordot_op-inl.h
│ │ │ ├── np_tensordot_op.cc
│ │ │ ├── np_tensordot_op.cu
│ │ │ ├── np_trace_op-inl.h
│ │ │ ├── np_trace_op.cc
│ │ │ ├── np_trace_op.cu
│ │ │ ├── np_tri_op-inl.h
│ │ │ ├── np_tri_op.cc
│ │ │ ├── np_tri_op.cu
│ │ │ ├── np_tril_op-inl.h
│ │ │ ├── np_tril_op.cc
│ │ │ ├── np_tril_op.cu
│ │ │ ├── np_triu_op-inl.h
│ │ │ ├── np_triu_op.cc
│ │ │ ├── np_triu_op.cu
│ │ │ ├── np_true_divide-inl.h
│ │ │ ├── np_true_divide.cc
│ │ │ ├── np_true_divide.cu
│ │ │ ├── np_unique_op.cc
│ │ │ ├── np_unique_op.cu
│ │ │ ├── np_unique_op.h
│ │ │ ├── np_where_backward_op.cc
│ │ │ ├── np_where_backward_op.cu
│ │ │ ├── np_where_forward_op.cc
│ │ │ ├── np_where_forward_op.cu
│ │ │ ├── np_where_op-inl.h
│ │ │ ├── np_window_op.cc
│ │ │ ├── np_window_op.cu
│ │ │ ├── np_window_op.h
│ │ │ └── random/
│ │ │ ├── dist_common.cc
│ │ │ ├── dist_common.cu
│ │ │ ├── dist_common.h
│ │ │ ├── np_bernoulli_op.cc
│ │ │ ├── np_bernoulli_op.cu
│ │ │ ├── np_bernoulli_op.h
│ │ │ ├── np_choice_op.cc
│ │ │ ├── np_choice_op.cu
│ │ │ ├── np_choice_op.h
│ │ │ ├── np_exponential_op.cc
│ │ │ ├── np_exponential_op.cu
│ │ │ ├── np_exponential_op.h
│ │ │ ├── np_gamma_op.cc
│ │ │ ├── np_gamma_op.cu
│ │ │ ├── np_gamma_op.h
│ │ │ ├── np_laplace_op.cc
│ │ │ ├── np_laplace_op.cu
│ │ │ ├── np_laplace_op.h
│ │ │ ├── np_location_scale_op.cc
│ │ │ ├── np_location_scale_op.cu
│ │ │ ├── np_location_scale_op.h
│ │ │ ├── np_multinomial_op.cc
│ │ │ ├── np_multinomial_op.cu
│ │ │ ├── np_multinomial_op.h
│ │ │ ├── np_normal_op.cc
│ │ │ ├── np_normal_op.cu
│ │ │ ├── np_normal_op.h
│ │ │ ├── np_pareto_op.cc
│ │ │ ├── np_pareto_op.cu
│ │ │ ├── np_pareto_op.h
│ │ │ ├── np_power_op.cc
│ │ │ ├── np_power_op.cu
│ │ │ ├── np_power_op.h
│ │ │ ├── np_rayleigh_op.cc
│ │ │ ├── np_rayleigh_op.cu
│ │ │ ├── np_rayleigh_op.h
│ │ │ ├── np_uniform_op.cc
│ │ │ ├── np_uniform_op.cu
│ │ │ ├── np_uniform_op.h
│ │ │ ├── np_weibull_op.cc
│ │ │ ├── np_weibull_op.cu
│ │ │ └── np_weibull_op.h
│ │ ├── operator.cc
│ │ ├── operator_common.h
│ │ ├── operator_tune-inl.h
│ │ ├── operator_tune.cc
│ │ ├── operator_tune.h
│ │ ├── operator_util.cc
│ │ ├── optimizer_op-inl.h
│ │ ├── optimizer_op.cc
│ │ ├── optimizer_op.cu
│ │ ├── pad-inl.h
│ │ ├── pad.cc
│ │ ├── pad.cu
│ │ ├── quantization/
│ │ │ ├── calibrate-inl.h
│ │ │ ├── calibrate.cc
│ │ │ ├── dequantize-inl.h
│ │ │ ├── dequantize.cc
│ │ │ ├── dequantize.cu
│ │ │ ├── dnnl/
│ │ │ │ ├── dnnl_dequantize-inl.h
│ │ │ │ ├── dnnl_quantize-inl.h
│ │ │ │ ├── dnnl_quantize_asym-inl.h
│ │ │ │ ├── dnnl_quantize_v2-inl.h
│ │ │ │ ├── dnnl_quantized_act.cc
│ │ │ │ ├── dnnl_quantized_batch_norm.cc
│ │ │ │ ├── dnnl_quantized_concat.cc
│ │ │ │ ├── dnnl_quantized_conv.cc
│ │ │ │ ├── dnnl_quantized_elemwise_add.cc
│ │ │ │ ├── dnnl_quantized_flatten.cc
│ │ │ │ ├── dnnl_quantized_fully_connected.cc
│ │ │ │ ├── dnnl_quantized_ops-inl.h
│ │ │ │ ├── dnnl_quantized_pooling.cc
│ │ │ │ ├── dnnl_quantized_reshape.cc
│ │ │ │ ├── dnnl_quantized_rnn-inl.h
│ │ │ │ ├── dnnl_quantized_rnn.cc
│ │ │ │ ├── dnnl_quantized_transpose.cc
│ │ │ │ └── dnnl_requantize-inl.h
│ │ │ ├── quantization_utils.h
│ │ │ ├── quantize-inl.h
│ │ │ ├── quantize.cc
│ │ │ ├── quantize.cu
│ │ │ ├── quantize_asym-inl.h
│ │ │ ├── quantize_asym.cc
│ │ │ ├── quantize_graph_pass.cc
│ │ │ ├── quantize_v2-inl.h
│ │ │ ├── quantize_v2.cc
│ │ │ ├── quantize_v2.cu
│ │ │ ├── quantized_activation.cc
│ │ │ ├── quantized_batch_norm.cc
│ │ │ ├── quantized_batch_norm_relu.cc
│ │ │ ├── quantized_concat.cc
│ │ │ ├── quantized_conv.cc
│ │ │ ├── quantized_conv.cu
│ │ │ ├── quantized_elemwise_add-inl.h
│ │ │ ├── quantized_elemwise_add.cc
│ │ │ ├── quantized_elemwise_mul-inl.h
│ │ │ ├── quantized_elemwise_mul.cc
│ │ │ ├── quantized_flatten-inl.h
│ │ │ ├── quantized_flatten.cc
│ │ │ ├── quantized_flatten.cu
│ │ │ ├── quantized_fully_connected.cc
│ │ │ ├── quantized_fully_connected.cu
│ │ │ ├── quantized_indexing_op.cc
│ │ │ ├── quantized_pooling.cc
│ │ │ ├── quantized_pooling.cu
│ │ │ ├── quantized_reshape-inl.h
│ │ │ ├── quantized_reshape.cc
│ │ │ ├── quantized_rnn-inl.h
│ │ │ ├── quantized_rnn.cc
│ │ │ ├── quantized_transpose.cc
│ │ │ ├── requantize-inl.h
│ │ │ ├── requantize.cc
│ │ │ └── requantize.cu
│ │ ├── random/
│ │ │ ├── multisample_op.cc
│ │ │ ├── multisample_op.cu
│ │ │ ├── multisample_op.h
│ │ │ ├── pdf_op.cc
│ │ │ ├── pdf_op.cu
│ │ │ ├── pdf_op.h
│ │ │ ├── sample_multinomial_op.cc
│ │ │ ├── sample_multinomial_op.cu
│ │ │ ├── sample_multinomial_op.h
│ │ │ ├── sample_op.cc
│ │ │ ├── sample_op.cu
│ │ │ ├── sample_op.h
│ │ │ ├── sampler.h
│ │ │ ├── shuffle_op.cc
│ │ │ ├── shuffle_op.cu
│ │ │ ├── unique_sample_op.cc
│ │ │ └── unique_sample_op.h
│ │ ├── regression_output-inl.h
│ │ ├── regression_output.cc
│ │ ├── regression_output.cu
│ │ ├── rnn-inl.h
│ │ ├── rnn.cc
│ │ ├── rnn.cu
│ │ ├── rnn_impl.h
│ │ ├── roi_pooling-inl.h
│ │ ├── roi_pooling.cc
│ │ ├── roi_pooling.cu
│ │ ├── sequence_last-inl.h
│ │ ├── sequence_last.cc
│ │ ├── sequence_last.cu
│ │ ├── sequence_mask-inl.h
│ │ ├── sequence_mask.cc
│ │ ├── sequence_mask.cu
│ │ ├── sequence_op_common.h
│ │ ├── sequence_reverse-inl.h
│ │ ├── sequence_reverse.cc
│ │ ├── sequence_reverse.cu
│ │ ├── slice_channel-inl.h
│ │ ├── slice_channel.cc
│ │ ├── slice_channel.cu
│ │ ├── softmax_output-inl.h
│ │ ├── softmax_output.cc
│ │ ├── softmax_output.cu
│ │ ├── spatial_transformer-inl.h
│ │ ├── spatial_transformer.cc
│ │ ├── spatial_transformer.cu
│ │ ├── special_functions-inl.h
│ │ ├── subgraph/
│ │ │ ├── build_subgraph.cc
│ │ │ ├── common.h
│ │ │ ├── default_subgraph_property.cc
│ │ │ ├── default_subgraph_property_v2.cc
│ │ │ ├── dnnl/
│ │ │ │ ├── dnnl_batch_dot.cc
│ │ │ │ ├── dnnl_batch_dot_property.h
│ │ │ │ ├── dnnl_bn_relu.cc
│ │ │ │ ├── dnnl_bn_relu_property.h
│ │ │ │ ├── dnnl_common.h
│ │ │ │ ├── dnnl_conv-inl.h
│ │ │ │ ├── dnnl_conv.cc
│ │ │ │ ├── dnnl_conv_property.h
│ │ │ │ ├── dnnl_fc-inl.h
│ │ │ │ ├── dnnl_fc.cc
│ │ │ │ ├── dnnl_fc_property.h
│ │ │ │ ├── dnnl_fc_sum_fuse_property.h
│ │ │ │ ├── dnnl_identity_property.h
│ │ │ │ ├── dnnl_post_amp_property.h
│ │ │ │ ├── dnnl_post_quantize_align_scale_property.h
│ │ │ │ ├── dnnl_post_quantize_property.h
│ │ │ │ ├── dnnl_pow_mul_scalar.cc
│ │ │ │ ├── dnnl_pow_mul_scalar_property.h
│ │ │ │ ├── dnnl_remove_casts_property.h
│ │ │ │ ├── dnnl_subgraph_base-inl.h
│ │ │ │ ├── dnnl_subgraph_property.cc
│ │ │ │ ├── dnnl_transformer-inl.h
│ │ │ │ ├── dnnl_transformer.cc
│ │ │ │ ├── dnnl_transformer_qk_common.h
│ │ │ │ ├── dnnl_transformer_qk_property.h
│ │ │ │ └── dnnl_transformer_valatt_property.h
│ │ │ ├── eliminate_common_nodes_pass.cc
│ │ │ ├── partitioner/
│ │ │ │ └── custom_subgraph_property.h
│ │ │ ├── static_shape_subgraph_property.cc
│ │ │ ├── subgraph_property.h
│ │ │ └── tensorrt/
│ │ │ ├── nnvm_to_onnx-inl.h
│ │ │ ├── nnvm_to_onnx.cc
│ │ │ ├── onnx_to_tensorrt.cc
│ │ │ ├── onnx_to_tensorrt.h
│ │ │ ├── tensorrt-inl.h
│ │ │ ├── tensorrt.cc
│ │ │ └── tensorrt.cu
│ │ ├── subgraph_op_common.cc
│ │ ├── subgraph_op_common.h
│ │ ├── svm_output-inl.h
│ │ ├── svm_output.cc
│ │ ├── svm_output.cu
│ │ ├── swapaxis-inl.h
│ │ ├── swapaxis.cc
│ │ ├── swapaxis.cu
│ │ ├── tensor/
│ │ │ ├── amp_cast.cc
│ │ │ ├── amp_cast.cu
│ │ │ ├── amp_cast.h
│ │ │ ├── broadcast_reduce-inl.h
│ │ │ ├── broadcast_reduce_minmax_value.cc
│ │ │ ├── broadcast_reduce_minmax_value.cu
│ │ │ ├── broadcast_reduce_norm_value.cc
│ │ │ ├── broadcast_reduce_norm_value.cu
│ │ │ ├── broadcast_reduce_op.cc
│ │ │ ├── broadcast_reduce_op.h
│ │ │ ├── broadcast_reduce_op_index.cc
│ │ │ ├── broadcast_reduce_op_index.cu
│ │ │ ├── broadcast_reduce_op_value.cc
│ │ │ ├── broadcast_reduce_op_value.cu
│ │ │ ├── broadcast_reduce_prod_value.cc
│ │ │ ├── broadcast_reduce_prod_value.cu
│ │ │ ├── broadcast_reduce_sum_value.cc
│ │ │ ├── broadcast_reduce_sum_value.cu
│ │ │ ├── cast_storage-inl.cuh
│ │ │ ├── cast_storage-inl.h
│ │ │ ├── cast_storage.cc
│ │ │ ├── cast_storage.cu
│ │ │ ├── control_flow_op.cc
│ │ │ ├── control_flow_op.cu
│ │ │ ├── control_flow_op.h
│ │ │ ├── diag_op-inl.h
│ │ │ ├── diag_op.cc
│ │ │ ├── diag_op.cu
│ │ │ ├── dot-inl.cuh
│ │ │ ├── dot-inl.h
│ │ │ ├── dot.cc
│ │ │ ├── dot.cu
│ │ │ ├── elemwise_binary_broadcast_op.cc
│ │ │ ├── elemwise_binary_broadcast_op.h
│ │ │ ├── elemwise_binary_broadcast_op_basic.cc
│ │ │ ├── elemwise_binary_broadcast_op_basic.cu
│ │ │ ├── elemwise_binary_broadcast_op_extended.cc
│ │ │ ├── elemwise_binary_broadcast_op_extended.cu
│ │ │ ├── elemwise_binary_broadcast_op_logic.cc
│ │ │ ├── elemwise_binary_broadcast_op_logic.cu
│ │ │ ├── elemwise_binary_op-inl.h
│ │ │ ├── elemwise_binary_op.cc
│ │ │ ├── elemwise_binary_op.h
│ │ │ ├── elemwise_binary_op_basic.cc
│ │ │ ├── elemwise_binary_op_basic.cu
│ │ │ ├── elemwise_binary_op_extended.cc
│ │ │ ├── elemwise_binary_op_extended.cu
│ │ │ ├── elemwise_binary_op_logic.cc
│ │ │ ├── elemwise_binary_op_logic.cu
│ │ │ ├── elemwise_binary_scalar_op.cc
│ │ │ ├── elemwise_binary_scalar_op.h
│ │ │ ├── elemwise_binary_scalar_op_basic.cc
│ │ │ ├── elemwise_binary_scalar_op_basic.cu
│ │ │ ├── elemwise_binary_scalar_op_extended.cc
│ │ │ ├── elemwise_binary_scalar_op_extended.cu
│ │ │ ├── elemwise_binary_scalar_op_logic.cc
│ │ │ ├── elemwise_binary_scalar_op_logic.cu
│ │ │ ├── elemwise_sum.cc
│ │ │ ├── elemwise_sum.cu
│ │ │ ├── elemwise_sum.h
│ │ │ ├── elemwise_unary_op.cc
│ │ │ ├── elemwise_unary_op.h
│ │ │ ├── elemwise_unary_op_basic.cc
│ │ │ ├── elemwise_unary_op_basic.cu
│ │ │ ├── elemwise_unary_op_logexp.cc
│ │ │ ├── elemwise_unary_op_logexp.cu
│ │ │ ├── elemwise_unary_op_pow.cc
│ │ │ ├── elemwise_unary_op_pow.cu
│ │ │ ├── elemwise_unary_op_trig.cc
│ │ │ ├── elemwise_unary_op_trig.cu
│ │ │ ├── histogram-inl.h
│ │ │ ├── histogram.cc
│ │ │ ├── histogram.cu
│ │ │ ├── index_add-inl.h
│ │ │ ├── index_add_backward.cc
│ │ │ ├── index_add_backward.cu
│ │ │ ├── index_add_forward.cc
│ │ │ ├── index_add_forward.cu
│ │ │ ├── index_update-inl.h
│ │ │ ├── index_update.cc
│ │ │ ├── index_update.cu
│ │ │ ├── indexing_op-inl.cuh
│ │ │ ├── indexing_op.cc
│ │ │ ├── indexing_op.cu
│ │ │ ├── indexing_op.h
│ │ │ ├── init_op.cc
│ │ │ ├── init_op.cu
│ │ │ ├── init_op.h
│ │ │ ├── la_op-inl.h
│ │ │ ├── la_op.cc
│ │ │ ├── la_op.cu
│ │ │ ├── la_op.h
│ │ │ ├── matrix_op-inl.h
│ │ │ ├── matrix_op.cc
│ │ │ ├── matrix_op.cu
│ │ │ ├── ordering_op-inl.h
│ │ │ ├── ordering_op.cc
│ │ │ ├── ordering_op.cu
│ │ │ ├── pseudo2DTranspose_op-inl.cuh
│ │ │ ├── ravel.cc
│ │ │ ├── ravel.cu
│ │ │ ├── ravel.h
│ │ │ ├── reduce_rtc.cc
│ │ │ ├── slice-inl.h
│ │ │ ├── sort_op-inl.cuh
│ │ │ ├── sort_op.h
│ │ │ ├── sparse_retain-inl.h
│ │ │ ├── sparse_retain.cc
│ │ │ ├── sparse_retain.cu
│ │ │ ├── square_sum-inl.h
│ │ │ ├── square_sum.cc
│ │ │ ├── square_sum.cu
│ │ │ └── util/
│ │ │ ├── tensor_util-inl.cuh
│ │ │ └── tensor_util-inl.h
│ │ └── tvmop/
│ │ ├── op_module.cc
│ │ └── op_module.h
│ ├── optimizer/
│ │ └── sgd-inl.h
│ ├── profiler/
│ │ ├── aggregate_stats.cc
│ │ ├── aggregate_stats.h
│ │ ├── custom_op_profiler.h
│ │ ├── profiler.cc
│ │ ├── profiler.h
│ │ ├── storage_profiler.cc
│ │ ├── storage_profiler.h
│ │ ├── vtune.cc
│ │ └── vtune.h
│ ├── resource.cc
│ ├── runtime/
│ │ ├── c_runtime_api.cc
│ │ ├── container.cc
│ │ ├── ndarray_handle.cc
│ │ ├── object.cc
│ │ ├── object_internal.h
│ │ └── registry.cc
│ ├── serialization/
│ │ ├── cnpy.cc
│ │ └── cnpy.h
│ └── storage/
│ ├── cpu_device_storage.h
│ ├── cpu_shared_storage_manager.h
│ ├── gpu_device_storage.h
│ ├── naive_storage_manager.h
│ ├── pinned_memory_storage.h
│ ├── pooled_storage_manager.h
│ ├── storage.cc
│ ├── storage_manager.h
│ └── storage_manager_helpers.h
├── tests/
│ ├── .gitignore
│ ├── CMakeLists.txt
│ ├── README.md
│ ├── cpp/
│ │ ├── .gitignore
│ │ ├── engine/
│ │ │ ├── engine_shutdown_test.cc
│ │ │ ├── omp_test.cc
│ │ │ ├── thread_local_test.cc
│ │ │ └── threaded_engine_test.cc
│ │ ├── include/
│ │ │ ├── test_core_op.h
│ │ │ ├── test_dnnl.h
│ │ │ ├── test_legacy_op.h
│ │ │ ├── test_ndarray_utils.h
│ │ │ ├── test_op.h
│ │ │ ├── test_op_runner.h
│ │ │ ├── test_perf.h
│ │ │ ├── test_tune.h
│ │ │ └── test_util.h
│ │ ├── kvstore/
│ │ │ └── gpu_topology_test.cc
│ │ ├── misc/
│ │ │ ├── base.cc
│ │ │ └── libinfo_test.cc
│ │ ├── operator/
│ │ │ ├── activation_perf.cc
│ │ │ ├── batchnorm_test.cc
│ │ │ ├── coreop_perf.cc
│ │ │ ├── dnnl_operator_test.cc
│ │ │ ├── dnnl_test.cc
│ │ │ ├── dropout_perf.cc
│ │ │ ├── fully_conn_perf.cc
│ │ │ ├── krprod_test.cc
│ │ │ ├── runner/
│ │ │ │ └── core_op_runner_test.cc
│ │ │ ├── slice_channel_perf.cc
│ │ │ └── tune/
│ │ │ └── operator_tune_test.cc
│ │ ├── storage/
│ │ │ └── storage_test.cc
│ │ └── test_main.cc
│ ├── nightly/
│ │ ├── .gitignore
│ │ ├── Jenkinsfile
│ │ ├── JenkinsfileForBinaries
│ │ ├── README.md
│ │ ├── TestDoc/
│ │ │ ├── doc_spell_checker.py
│ │ │ └── doc_spell_grammar.sh
│ │ ├── common.py
│ │ ├── dist_async_kvstore.py
│ │ ├── dist_device_sync_kvstore.py
│ │ ├── dist_device_sync_kvstore_byteps.py
│ │ ├── dist_device_sync_kvstore_custom.py
│ │ ├── dist_device_sync_kvstore_horovod.py
│ │ ├── dist_sync_kvstore.py
│ │ ├── estimator/
│ │ │ ├── test_estimator_cnn.py
│ │ │ └── test_sentiment_rnn.py
│ │ ├── model_backwards_compatibility_check/
│ │ │ ├── JenkinsfileForMBCC
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── common.py
│ │ │ ├── model_backward_compat_checker.sh
│ │ │ ├── model_backwards_compat_inference.py
│ │ │ ├── model_backwards_compat_train.py
│ │ │ ├── train_mxnet_legacy_models.sh
│ │ │ └── upload_models_to_s3.sh
│ │ ├── test_distributed_training-gpu.sh
│ │ ├── test_kvstore.py
│ │ ├── test_large_array.py
│ │ ├── test_large_vector.py
│ │ ├── test_np_large_array.py
│ │ ├── test_np_random.py
│ │ └── test_server_profiling.py
│ ├── python/
│ │ ├── README.md
│ │ ├── amp/
│ │ │ └── common.py
│ │ ├── array-api/
│ │ │ └── test_data_interchange.py
│ │ ├── common/
│ │ │ └── models.py
│ │ ├── conftest.py
│ │ ├── dnnl/
│ │ │ ├── op_cfg.py
│ │ │ ├── subgraphs/
│ │ │ │ ├── subgraph_common.py
│ │ │ │ ├── test_amp_subgraph.py
│ │ │ │ ├── test_conv_subgraph.py
│ │ │ │ ├── test_fc_subgraph.py
│ │ │ │ ├── test_matmul_subgraph.py
│ │ │ │ └── test_pow_mul_subgraph.py
│ │ │ ├── test_amp.py
│ │ │ ├── test_bf16_operator.py
│ │ │ ├── test_dnnl.py
│ │ │ └── test_quantization_dnnl.py
│ │ ├── doctest/
│ │ │ └── test_docstring.py
│ │ ├── gpu/
│ │ │ ├── test_amp.py
│ │ │ ├── test_amp_init.py
│ │ │ ├── test_deferred_compute_gpu.py
│ │ │ ├── test_device.py
│ │ │ ├── test_extensions_gpu.py
│ │ │ ├── test_fusion.py
│ │ │ ├── test_gluon_gpu.py
│ │ │ ├── test_gluon_model_zoo_gpu.py
│ │ │ ├── test_gluon_transforms.py
│ │ │ ├── test_kvstore_gpu.py
│ │ │ ├── test_nccl.py
│ │ │ ├── test_numpy_einsum.py
│ │ │ ├── test_numpy_fallback.py
│ │ │ ├── test_operator_gpu.py
│ │ │ ├── test_profiler_gpu.py
│ │ │ ├── test_rtc.py
│ │ │ ├── test_tvm_bridge.py
│ │ │ └── test_tvm_op_gpu.py
│ │ ├── onnx/
│ │ │ ├── test_models.py
│ │ │ └── test_operators.py
│ │ ├── profiling/
│ │ │ ├── simple_forward.py
│ │ │ └── test_nvtx.py
│ │ ├── quantization/
│ │ │ └── test_quantization.py
│ │ ├── test_quantization_gpu.py
│ │ ├── train/
│ │ │ ├── common.py
│ │ │ └── test_autograd.py
│ │ └── unittest/
│ │ ├── common.py
│ │ ├── legacy_ndarray.v0
│ │ ├── test_attr.py
│ │ ├── test_autograd.py
│ │ ├── test_base.py
│ │ ├── test_contrib_control_flow.py
│ │ ├── test_contrib_gluon_data_vision.py
│ │ ├── test_contrib_hawkesll.py
│ │ ├── test_contrib_intgemm.py
│ │ ├── test_contrib_io.py
│ │ ├── test_contrib_krprod.py
│ │ ├── test_contrib_operator.py
│ │ ├── test_contrib_optimizer.py
│ │ ├── test_contrib_stes_op.py
│ │ ├── test_deferred_compute.py
│ │ ├── test_dgl_graph.py
│ │ ├── test_dynamic_shape.py
│ │ ├── test_engine.py
│ │ ├── test_engine_import.py
│ │ ├── test_exc_handling.py
│ │ ├── test_executor.py
│ │ ├── test_extensions.py
│ │ ├── test_ffi_container.py
│ │ ├── test_gluon.py
│ │ ├── test_gluon_batch_processor.py
│ │ ├── test_gluon_control_flow.py
│ │ ├── test_gluon_data.py
│ │ ├── test_gluon_estimator.py
│ │ ├── test_gluon_event_handler.py
│ │ ├── test_gluon_indexing.py
│ │ ├── test_gluon_model_zoo.py
│ │ ├── test_gluon_probability_v2.py
│ │ ├── test_gluon_rnn.py
│ │ ├── test_gluon_save.py
│ │ ├── test_gluon_trainer.py
│ │ ├── test_gluon_utils.py
│ │ ├── test_higher_order_grad.py
│ │ ├── test_image.py
│ │ ├── test_infer_shape.py
│ │ ├── test_infer_type.py
│ │ ├── test_io.py
│ │ ├── test_kvstore.py
│ │ ├── test_kvstore_custom.py
│ │ ├── test_loss.py
│ │ ├── test_memory_opt.py
│ │ ├── test_metric.py
│ │ ├── test_ndarray.py
│ │ ├── test_numpy_contrib_gluon_data_vision.py
│ │ ├── test_numpy_default_dtype.py
│ │ ├── test_numpy_gluon.py
│ │ ├── test_numpy_gluon_data_vision.py
│ │ ├── test_numpy_interoperability.py
│ │ ├── test_numpy_loss.py
│ │ ├── test_numpy_ndarray.py
│ │ ├── test_numpy_op.py
│ │ ├── test_operator.py
│ │ ├── test_optimizer.py
│ │ ├── test_profiler.py
│ │ ├── test_random.py
│ │ ├── test_recordio.py
│ │ ├── test_runtime.py
│ │ ├── test_smoke.py
│ │ ├── test_sparse_ndarray.py
│ │ ├── test_sparse_operator.py
│ │ ├── test_subgraph.py
│ │ ├── test_subgraph_op.py
│ │ ├── test_symbol.py
│ │ ├── test_test_utils.py
│ │ ├── test_thread_local.py
│ │ ├── test_tvm_op.py
│ │ └── test_viz.py
│ ├── tutorials/
│ │ ├── test_sanity_tutorials.py
│ │ └── test_tutorials.py
│ └── utils/
│ └── notebook_test/
│ └── __init__.py
└── tools/
├── bandwidth/
│ ├── .gitignore
│ ├── README.md
│ ├── measure.py
│ └── test_measure.py
├── cfn/
│ └── Readme.md
├── create_source_archive.sh
├── dependencies/
│ ├── LICENSE.binary.dependencies
│ ├── README.md
│ ├── cityhash.sh
│ ├── curl.sh
│ ├── eigen.sh
│ ├── libpng.sh
│ ├── libtiff.sh
│ ├── libturbojpeg.sh
│ ├── libz.sh
│ ├── lz4.sh
│ ├── make_shared_dependencies.sh
│ ├── mkl.sh
│ ├── numpy_mkl.sh
│ ├── openblas.sh
│ ├── opencv.sh
│ ├── openssl.sh
│ ├── patch/
│ │ └── opencv_lapack.h
│ ├── protobuf.sh
│ └── zmq.sh
├── diagnose.py
├── flakiness_checker.py
├── git-pre-commit
├── im2rec.cc
├── im2rec.py
├── ipynb2md.py
├── kill-mxnet.py
├── launch.py
├── license_header.py
├── lint/
│ ├── clang_format_ci.sh
│ └── git-clang-format-13
├── parse_log.py
├── pip/
│ ├── MANIFEST.in
│ ├── doc/
│ │ ├── CPU_ADDITIONAL.md
│ │ ├── CU101_ADDITIONAL.md
│ │ ├── CU102_ADDITIONAL.md
│ │ ├── CU110_ADDITIONAL.md
│ │ ├── CU112_ADDITIONAL.md
│ │ ├── NATIVE_ADDITIONAL.md
│ │ └── PYPI_README.md
│ ├── sanity_test.py
│ └── setup.py
├── profile/
│ └── tune_python.sh
├── rec2idx.py
├── staticbuild/
│ ├── README.md
│ ├── build.sh
│ ├── build_lib.sh
│ └── build_wheel.sh
└── windowsbuild/
├── README.md
├── gen_warp.cpp
└── warp_dll.cpp
================================================
FILE CONTENTS
================================================
================================================
FILE: .asf.yaml
================================================
notifications:
commits: commits@mxnet.apache.org
issues: issues@mxnet.apache.org
pullrequests: commits@mxnet.apache.org
github:
features:
wiki: true
issues: true
projects: true
enabled_merge_buttons:
squash: true
merge: false
rebase: true
================================================
FILE: .clang-format
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
---
Language: Cpp
BasedOnStyle: Google
ColumnLimit: 100
AlignConsecutiveAssignments: true
AlignConsecutiveDeclarations: false
AlignConsecutiveMacros: true
DerivePointerAlignment: false
MaxEmptyLinesToKeep: 1
PointerAlignment: Left
AllowAllParametersOfDeclarationOnNextLine: false
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: Empty
AllowShortIfStatementsOnASingleLine: false
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
# Keep #include order as written. Declared exactly once: duplicate mapping keys are invalid YAML.
SortIncludes: false
BreakBeforeTernaryOperators: false
---
Language: JavaScript
DisableFormat: true
================================================
FILE: .clang-tidy
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# The checks defined here will be run and will display by default as warnings.
Checks: >
-*, cppcoreguidelines-*, clang-analyzer-*, modernize-*,
performance-faster-string-find, performance-for-range-copy,
performance-implicit-conversion-in-loop, performance-inefficient-algorithm,
performance-inefficient-string-concatenation, performance-trivially-destructible,
performance-inefficient-vector-operation, performance-move-const-arg,
performance-move-constructor-init, performance-noexcept-move-constructor,
performance-no-automatic-move, performance-unnecessary-copy-initialization,
performance-type-promotion-in-math-fn
# performance checks not enabled due to segmentation fault in clang-tidy v8+:
# performance-unnecessary-value-param
# In order to trigger an error, you must have a rule defined both in checks and in this section.
WarningsAsErrors: >
cppcoreguidelines-no-malloc, modernize-deprecated-headers,
modernize-loop-convert, modernize-make-shared, modernize-pass-by-value, modernize-make-unique,
modernize-raw-string-literal, modernize-redundant-void-arg, modernize-replace-auto-ptr,
modernize-replace-random-shuffle, modernize-return-braced-init-list, modernize-shrink-to-fit,
modernize-unary-static-assert, modernize-use-bool-literals, modernize-use-default-member-init,
modernize-use-emplace, modernize-use-equals-default, modernize-use-equals-delete,
modernize-use-noexcept, modernize-use-nullptr, modernize-use-override,
modernize-use-transparent-functors, modernize-use-using,
performance-faster-string-find, performance-implicit-conversion-in-loop,
performance-inefficient-algorithm, performance-inefficient-string-concatenation,
performance-trivially-destructible, performance-inefficient-vector-operation,
performance-move-const-arg, performance-move-constructor-init,
performance-noexcept-move-constructor, performance-no-automatic-move,
performance-unnecessary-copy-initialization, performance-type-promotion-in-math-fn
# modernize checks not enforced:
# modernize-use-auto
# modernize-avoid-bind
# performance checks not enforced due to segmentation fault
# performance-for-range-copy
# Todo: define a better regex match that includes most project headers, but excludes third party
# code.
HeaderFilterRegex: '^src/.*'
================================================
FILE: .cmakelintrc
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# build and install are separated so changes to build don't invalidate
# the whole docker cache for the image
# --filter= options: https://pypi.org/project/cmakelint/
# "-" disable option
# "+" enable option
filter=-convention/filename,-linelength,-package/consistency,-readability/logic,-readability/mixedcase,-readability/wonkycase,-syntax,-whitespace/eol,+whitespace/extra,-whitespace/indent,-whitespace/mismatch,-whitespace/newline,-whitespace/tabs
================================================
FILE: .codecov.yml
================================================
# Codecov.io configuration file
# See https://docs.codecov.io/docs/codecovyml-reference
codecov:
notify:
require_ci_to_pass: yes
coverage:
status:
project: off
patch: on
precision: 2
round: down
range: "70...100"
parsers:
gcov:
branch_detection:
conditional: yes
loop: yes
method: no
macro: no
ignore:
- "tests/**/*"
# Disable comments for now to gather data in the background
comment: false
# layout: "header, diff"
# behavior: default
# require_changes: no
================================================
FILE: .git-blame-ignore-revs
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Clang-formatter initial commit - /src directory is formatted
e359bcd65e453d4bc86d3d8e5b1dee3916a2e426
# Clang-formatter initial commit - OneDNN files
718a860f3aa8f24acca2aec867a3b31bc60a6e79
================================================
FILE: .gitattributes
================================================
.gitattributes export-ignore
R-package/* export-ignore
================================================
FILE: .github/ISSUE_TEMPLATE/bug_report.md
================================================
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: 'Bug, needs triage'
assignees: ''
---
## Description
(A clear and concise description of what the bug is.)
### Error Message
(Paste the complete error message. Please also include stack trace by setting environment variable `DMLC_LOG_STACK_TRACE_DEPTH=100` before running your script.)
## To Reproduce
(If you developed your own code, please provide a short script that reproduces the error. For existing examples, please provide link.)
### Steps to reproduce
(Paste the commands you ran that produced the error.)
1.
2.
## What have you tried to solve it?
1.
2.
## Environment
***We recommend using our script for collecting the diagnostic information with the following command***
`curl --retry 10 -s https://raw.githubusercontent.com/apache/incubator-mxnet/master/tools/diagnose.py | python3`
Environment Information
```
# Paste the diagnose.py command output here
```
================================================
FILE: .github/ISSUE_TEMPLATE/config.yml
================================================
blank_issues_enabled: false
contact_links:
- name: GitHub Discussions
url: https://github.com/apache/mxnet/discussions
about: Use GitHub Discussions to ask and answer questions, exchange ideas, and share learning.
- name: Discourse Forum
url: https://discuss.mxnet.io/
about: Discussion forum for usage questions.
- name: Stack Overflow
url: https://stackoverflow.com/questions/tagged/mxnet
about: Ask and answer usage questions on Stack Overflow
================================================
FILE: .github/ISSUE_TEMPLATE/feature_request.md
================================================
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: 'Feature request'
assignees: ''
---
## Description
(A clear and concise description of what the feature is.)
- If the proposal is about a new model, provide description of what the model is.
- If the proposal is about an API, provide mock examples if possible.
## References
- list reference and related literature
- list known implementations
================================================
FILE: .github/ISSUE_TEMPLATE/flaky_test.md
================================================
---
name: Flaky test
about: Report a flaky test
title: ''
labels: 'Flaky'
assignees: ''
---
## Description
(The location and name of the flaky test.)
## Occurrences
(Links to the known occurrences.)
## What have you tried to solve it?
1.
2.
================================================
FILE: .github/ISSUE_TEMPLATE/rfc.md
================================================
---
name: Request for comment (RFC)
about: Use the RFC process to request review of the design of a new feature or bug fix that requires significant effort. This thread is automatically mirrored to the dev@mxnet.apache.org mailing list.
title: '[RFC] '
labels: 'RFC'
assignees: ''
---
## Problem statement
(A clear and concise description of what this contribution is trying to solve.)
## Proposed solutions
(Description of the approach this contribution takes to solve the problem.)
## References
- list reference and related literature
- list known implementations
================================================
FILE: .github/PULL_REQUEST_TEMPLATE.md
================================================
## Description ##
(Brief description of what this PR is about)
## Checklist ##
### Essentials ###
- [ ] PR's title starts with a category (e.g. [BUGFIX], [MODEL], [TUTORIAL], [FEATURE], [DOC], etc)
- [ ] Changes are complete (i.e. I finished coding on this PR)
- [ ] All changes have test coverage
- [ ] Code is well-documented
### Changes ###
- [ ] Feature1, tests, (and when applicable, API doc)
- [ ] Feature2, tests, (and when applicable, API doc)
## Comments ##
- If this change is backward incompatible, explain why it must be made.
- Interesting edge cases to note here
================================================
FILE: .github/workflows/greetings.yml
================================================
name: Greetings
on: [pull_request, issues]
jobs:
greeting:
runs-on: ubuntu-latest
steps:
- uses: actions/first-interaction@v1
env:
GITHUB_PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
GITHUB_PR_RUN_ID: ${{ github.run_id }}
GITHUB_PR_BASE_REF: ${{ github.event.pull_request.base.ref }}
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
issue-message: |
Welcome to Apache MXNet (incubating)! We are on a mission to democratize AI, and we are glad that you are contributing to it by opening this issue.
Please make sure to include all the relevant context, and one of the @apache/mxnet-committers will be here shortly.
If you are interested in contributing to our project, let us know! Also, be sure to check out our guide on [contributing to MXNet](https://mxnet.apache.org/community/contribute) and our [development guides wiki](https://cwiki.apache.org/confluence/display/MXNET/Developments).
pr-message: |
Welcome to Apache MXNet (incubating)! We are on a mission to democratize AI, and we are glad that you are contributing to it by opening this pull request.
Please make sure that the changes are covered by tests. One of the @apache/mxnet-committers will be here shortly.
If you run into any issue with the CI and tests, we recommend that you first check out our [developer guides wiki](https://cwiki.apache.org/confluence/display/MXNET/Developments).
Let our @apache/mxnet-committers know if you need any help!
================================================
FILE: .github/workflows/license_check.yml
================================================
name: license check
on: [push, pull_request]
defaults:
run:
shell: bash
jobs:
licensecheck:
runs-on: ubuntu-latest
strategy:
fail-fast: false
steps:
- name: Checkout repository
uses: actions/checkout@v2
- name: Update Submodules
run: |
git submodule update --init --recursive
- name: Check License Header
uses: apache/skywalking-eyes@main
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
================================================
FILE: .github/workflows/link_check.yml
================================================
name: link check
on: [push, pull_request]
defaults:
run:
shell: bash
jobs:
linkcheck:
runs-on: ubuntu-20.04
strategy:
fail-fast: false
steps:
- name: Checkout repository
uses: actions/checkout@v2
- name: Compilation cache
uses: actions/cache@v2
with:
path: ~/.ccache
# We include the commit sha in the cache key, as new cache entries are
# only created if there is no existing entry for the key yet.
key: ${{ runner.os }}-ccache-${{ github.sha }}
# Restore any ccache cache entry, if none for
# ${{ runner.os }}-ccache-${{ github.sha }} exists
restore-keys: |
${{ runner.os }}-ccache
- name: Setup python
uses: actions/setup-python@v2
with:
python-version: '3.8'
architecture: x64
- name: Install Dependencies
run: |
sudo apt-get update
sudo apt-get install -y libopenblas-dev ninja-build ccache python3-sphinx \
pandoc gcc-7 g++-7 libopencv-dev protobuf-compiler libprotobuf-dev
ccache -M 500M # Limit the ccache size; Github's overall cache limit is 5GB
python -m pip install pandoc-attributes==0.1.7
python -m pip install -r docs/python_docs/requirements
python -m pip install docs/python_docs/themes/mx-theme
shell: bash
- name: Build project
env:
CC: gcc-7
CXX: g++-7
run: |
git submodule update --init --recursive
mkdir build; cd build
CXXFLAGS="-Wno-error=strict-overflow" cmake \
-DCMAKE_BUILD_TYPE="RelWithDebInfo" \
-DUSE_ONEDNN=OFF \
-DUSE_CUDA=OFF \
-G Ninja ..
ninja
cd ..
shell: bash
- name: Setup Python
run: |
python -m pip install --user -e python
- name: Link Check
env:
MAX_RETRY: 3
run: |
# Run the Sphinx link check, retrying up to MAX_RETRY times before failing the step.
# Enter the docs directory once: a relative `cd` inside the loop would fail on the second attempt.
cd docs/python_docs/python
# Count with an arithmetic loop; brace expansion ({1..$MAX_RETRY}) does not expand variables,
# so the original loop body ran exactly once with the literal string as its value.
for ((run = 1; run <= MAX_RETRY; run++))
do
make clean
# Test the command directly in `if`: this step's bash runs with `-e`, so a bare failing
# command would abort the script before `$?` could be inspected.
if timeout 10m make linkcheck EVAL=0
then
break
elif [[ $run -eq $MAX_RETRY ]]
then
# Every attempt failed; report failure to the workflow.
exit 1
fi
done
================================================
FILE: .github/workflows/os_x_mklbuild.yml
================================================
name: mkl continuous build
on: [push, pull_request]
jobs:
macosx-x86_64:
runs-on: macos-10.15
steps:
- name: Checkout repository
uses: actions/checkout@v2
- name: Compilation cache
uses: actions/cache@v2
with:
path: ~/.ccache
# We include the commit sha in the cache key, as new cache entries are
# only created if there is no existing entry for the key yet.
key: ${{ runner.os }}-ccache-${{ github.sha }}
# Restore any ccache cache entry, if none for
# ${{ runner.os }}-ccache-${{ github.sha }} exists
restore-keys: |
${{ runner.os }}-ccache
- name: Setup python
uses: actions/setup-python@v2
with:
python-version: '3.8'
architecture: x64
- name: Install Dependencies
run: |
brew install nasm automake ninja libtool cmake pkgconfig protobuf hdf5 zlib ccache
ccache -M 500M # Limit the ccache size; Github's overall cache limit is 5GB
python -m pip install -r ci/docker/install/requirements
shell: bash
- name: Build project
run: |
./tools/staticbuild/build.sh cpu mkl
- name: Setup Python
run: |
python -m pip install --user -e python
- name: Test project
run: |
python -m pytest -n 4 --durations=50 --verbose tests/python/unittest/ -k 'not test_operator and not (test_subgraph or test_custom_op or test_external_op or test_recordimage_dataset_with_data_loader_multiworker or test_multi_worker or test_multi_worker_shape or test_multi_worker_forked_data_loader or test_multi_worker_dataloader_release_pool)' -m 'not serial'
MXNET_ENGINE_TYPE=NaiveEngine python -m pytest -n 4 --durations=50 --verbose tests/python/unittest/ -k 'test_operator and not (test_subgraph or test_custom_op or test_external_op or test_recordimage_dataset_with_data_loader_multiworker or test_multi_worker or test_multi_worker_shape or test_multi_worker_forked_data_loader or test_multi_worker_dataloader_release_pool)' -m 'not serial'
python -m pytest --durations=50 --verbose tests/python/unittest/ -k 'not (test_subgraph or test_custom_op or test_external_op or test_recordimage_dataset_with_data_loader_multiworker or test_multi_worker or test_multi_worker_shape or test_multi_worker_forked_data_loader or test_multi_worker_dataloader_release_pool)' -m 'serial'
python -m pytest -n 4 --durations=50 --verbose tests/python/dnnl -k 'not (test_bf16_operator or test_amp or test_amp_subgraph)'
================================================
FILE: .github/workflows/os_x_staticbuild.yml
================================================
name: continuous build
on: [push, pull_request]
jobs:
macosx-x86_64:
runs-on: macos-latest
steps:
- name: Checkout repository
uses: actions/checkout@v2
- name: Compilation cache
uses: actions/cache@v2
with:
path: ~/.ccache
# We include the commit sha in the cache key, as new cache entries are
# only created if there is no existing entry for the key yet.
key: ${{ runner.os }}-ccache-${{ github.sha }}
# Restore any ccache cache entry, if none for
# ${{ runner.os }}-ccache-${{ github.sha }} exists
restore-keys: |
${{ runner.os }}-ccache
- name: Setup python
uses: actions/setup-python@v2
with:
python-version: '3.8'
architecture: x64
- name: Install Dependencies
run: |
brew install nasm automake ninja libtool cmake pkgconfig protobuf hdf5 zlib ccache
ccache -M 500M # Limit the ccache size; Github's overall cache limit is 5GB
python -m pip install -r ci/docker/install/requirements
shell: bash
- name: Build project
run: |
CMAKE_STATICBUILD=1 ./tools/staticbuild/build.sh cpu
- name: Setup Python
run: |
python -m pip install --user -e python
- name: Build with Cython
run: |
cd python
python setup.py build_ext --inplace --with-cython
- name: Test project
env:
MXNET_ENABLE_CYTHON: 1
run: |
python3 -m pytest -n 4 --durations=50 --verbose tests/python/unittest/ -k 'not test_operator and not (test_subgraph or test_custom_op or test_external_op or test_recordimage_dataset_with_data_loader_multiworker or test_multi_worker or test_multi_worker_shape or test_multi_worker_forked_data_loader or test_multi_worker_dataloader_release_pool)' -m 'not serial'
MXNET_ENGINE_TYPE=NaiveEngine python3 -m pytest -n 4 --durations=50 --verbose tests/python/unittest/ -k 'test_operator and not (test_subgraph or test_custom_op or test_external_op or test_recordimage_dataset_with_data_loader_multiworker or test_multi_worker or test_multi_worker_shape or test_multi_worker_forked_data_loader or test_multi_worker_dataloader_release_pool)' -m 'not serial'
python3 -m pytest --durations=50 --verbose tests/python/unittest/ -k 'not (test_subgraph or test_custom_op or test_external_op or test_recordimage_dataset_with_data_loader_multiworker or test_multi_worker or test_multi_worker_shape or test_multi_worker_forked_data_loader or test_multi_worker_dataloader_release_pool)' -m 'serial'
# Runs a pinned revision of the data-apis/array-api-tests suite against
# mxnet.numpy. The suite is cloned next to the MXNet checkout, and only the
# listed test modules / test functions are executed, each with up to 3 reruns.
- name: Test Array API
  env:
    MXNET_ENABLE_CYTHON: 1
  run: |
    cd ..
    git clone https://github.com/data-apis/array-api-tests.git
    cd array-api-tests
    git checkout c1dba80a196a03f880d2e0a998a272fb3867b720
    export ARRAY_API_TESTS_MODULE=mxnet.numpy
    export DMLC_LOG_STACK_TRACE_DEPTH=100
    python3 -m pytest --reruns 3 --durations=50 --verbose array_api_tests/test_creation_functions.py
    python3 -m pytest --reruns 3 --durations=50 --verbose array_api_tests/test_indexing.py
    python3 -m pytest --reruns 3 --durations=50 --verbose array_api_tests/test_constants.py
    python3 -m pytest --reruns 3 --durations=50 --verbose array_api_tests/test_elementwise_functions.py
    python3 -m pytest --reruns 3 --durations=50 --verbose array_api_tests/test_broadcasting.py
    python3 -m pytest --reruns 3 --durations=50 --verbose \
        array_api_tests/test_type_promotion.py::test_elementwise_function_two_arg_bool_type_promotion
    python3 -m pytest --reruns 3 --durations=50 --verbose \
        array_api_tests/test_type_promotion.py::test_elementwise_function_two_arg_promoted_type_promotion
    python3 -m pytest --reruns 3 --durations=50 --verbose \
        array_api_tests/test_type_promotion.py::test_elementwise_function_one_arg_bool
    python3 -m pytest --reruns 3 --durations=50 --verbose \
        array_api_tests/test_type_promotion.py::test_elementwise_function_one_arg_type_promotion
    python3 -m pytest --reruns 3 --durations=50 --verbose \
        array_api_tests/test_type_promotion.py::test_operator_one_arg_type_promotion
    python3 -m pytest --reruns 3 --durations=50 --verbose \
        array_api_tests/test_type_promotion.py::test_operator_two_arg_bool_promotion
    python3 -m pytest --reruns 3 --durations=50 --verbose \
        array_api_tests/test_type_promotion.py::test_operator_two_arg_promoted_promotion
    python3 -m pytest --reruns 3 --durations=50 --verbose \
        array_api_tests/test_type_promotion.py::test_operator_inplace_two_arg_promoted_promotion
================================================
FILE: .gitignore
================================================
# Compiled Object files
*.slo
*.lo
*.o
*.obj
# Precompiled Headers
*.gch
*.pch
# Compiled Dynamic libraries
*.so
*.dylib
*.dll
# Fortran module files
*.mod
# Compiled Static libraries
*.lai
*.la
*.a
*.lib
# Executables
*.exe
*.out
*.app
*~
# doc
doc/html
doc/latex
doc/doc
docs/web-data
.jekyll-cache
*.lock
#dmlc
config.mk
config.cmake
*.pyc
.Rhistory
*log
Debug
*suo
tracker
# vim
*.swp
*.swo
*.swn
.vimrc
.ycm_extra_conf.py
.ycm_extra_conf.pyc
# Emacs
.#*
.clang_complete
.dir-locals.el
__pycache__
*.pkl
*.params
*.states
*.json
*.d
cmake-build*
data
model
recommonmark
# R
*.Rcheck
*.rds
*.Rproj
.Rproj.user
R-package/inst/*
*.tar.gz
*.tgz
R-package/man/*.Rd
R-package/R/mxnet_generated.R
# data
*.rec
*.lst
*.zip
*ubyte
*.bin
*.txt
!CMakeLists.txt
# ipython notebook
*_pb2.py
*.ipynb_checkpoints*
input.txt*
# JetBrains
.idea
.gradle
*.iml
# ctags
tags
# cscope
cscope.out
cscope.files
# Eclipse project config
.project
.cproject
.classpath
.settings
.pydevproject
CMakeFiles
cmake_install.cmake
# Visual Studio Code
.vscode
# Mac OS X
.DS_Store
# Windows
windows_package.7z
windows_package
#Notebook Automated Test
!tests/nightly/test_tutorial_config.txt
!tests/nightly/TestNotebook
tests/nightly/tmp_notebook
# pip building tools
tools/pip_package/build
tools/pip_package/dist
tools/pip_package/mxnet.egg-info
tools/pip_package/mxnet
# temporary path for building dependencies when building wheel
deps/
staticdeps/
tmp/
build/
lib/
bin/
model/
# VTune result directories (r000hs, r001hs, ...) created in the repository root.
# A leading "/" anchors the pattern to the directory of this .gitignore; a
# leading "./" is not normalised by git and would never match.
/r0*hs
# generated function signature for IDE auto-complete
python/mxnet/symbol/gen_*
python/mxnet/ndarray/gen_*
python/.eggs
# tests if built insource
*CTestTestfile.cmake
*DartConfiguration.tcl
tests/Makefile
tests/mxnet_unit_tests
# Code coverage related
.coverage
*.gcov
*.gcno
coverage.xml
# Local CMake build config
cmake_options.yml
# header file generated at compile time
include/onednn/oneapi/dnnl/dnnl_version.h
include/onednn/oneapi/dnnl/dnnl_config.h
================================================
FILE: .gitmodules
================================================
[submodule "3rdparty/dmlc-core"]
path = 3rdparty/dmlc-core
url = https://github.com/dmlc/dmlc-core.git
[submodule "3rdparty/ps-lite"]
path = 3rdparty/ps-lite
url = https://github.com/dmlc/ps-lite
[submodule "3rdparty/dlpack"]
path = 3rdparty/dlpack
url = https://github.com/dmlc/dlpack
[submodule "3rdparty/googletest"]
path = 3rdparty/googletest
url = https://github.com/google/googletest.git
[submodule "3rdparty/tvm"]
path = 3rdparty/tvm
url = https://github.com/apache/incubator-tvm.git
[submodule "3rdparty/onnx-tensorrt"]
path = 3rdparty/onnx-tensorrt
url = https://github.com/onnx/onnx-tensorrt.git
[submodule "3rdparty/nvidia_cub"]
path = 3rdparty/nvidia_cub
url = https://github.com/NVlabs/cub.git
[submodule "3rdparty/libzip"]
path = 3rdparty/libzip
url = https://github.com/nih-at/libzip.git
[submodule "3rdparty/intgemm"]
path = 3rdparty/intgemm
url = https://github.com/kpu/intgemm
[submodule "3rdparty/onednn"]
path = 3rdparty/onednn
url = https://github.com/oneapi-src/oneDNN
================================================
FILE: .licenserc.yaml
================================================
# License-header check configuration. Every path listed under `paths-ignore`
# is excluded from the header check; each entry appears exactly once.
header:
  license:
    spdx-id: Apache-2.0
    copyright-owner: Apache Software Foundation

  paths-ignore:
    - 'licenses'
    - 'LICENSE'
    - 'NOTICE'
    - '3rdparty'
    - 'DISCLAIMER'
    - 'KEYS'
    - 'tools/dependencies/LICENSE.binary.dependencies'
    - 'tools/lint/git-clang-format-13'

    # files not distributed in source archive (listed in tools/source-exclude-artifacts.txt)
    - 'docs'
    - 'CODEOWNERS'
    - '.gitignore'
    - '.codecov.yml'
    - '.gitattributes'
    - '.github'
    - '.gitmodules'
    - '.licenserc.yaml'
    - '.asf.yaml'
    - 'python/mxnet/_cy3/README.md'

    # files licensed under apache-2.0 license but do not include full license headers recognized by skywalking-eyes
    - '**/*.ipynb'
    - 'src/operator/deformable_convolution-inl.h'
    - 'src/operator/deformable_convolution.cc'
    - 'src/operator/deformable_convolution.cu'
    - 'src/operator/contrib/deformable_psroi_pooling-inl.h'
    - 'src/operator/contrib/deformable_psroi_pooling.cc'
    - 'src/operator/contrib/deformable_psroi_pooling.cu'
    - 'src/operator/contrib/multi_proposal-inl.h'
    - 'src/operator/contrib/multi_proposal.cc'
    - 'src/operator/contrib/multi_proposal.cu'
    - 'src/operator/contrib/psroi_pooling.cc'
    - 'src/operator/contrib/psroi_pooling.cu'
    - 'src/operator/nn/dnnl/dnnl_base-inl.h'

    # files licensed under boost license
    - 'cmake/Modules/FindJeMalloc.cmake'

    # files licensed under bsd 2-clause + caffe
    - 'src/operator/nn/pool.cuh'
    - 'src/operator/nn/pool.h'
    - 'src/operator/nn/im2col.cuh'
    - 'src/operator/nn/im2col.h'
    - 'src/operator/contrib/nn/deformable_im2col.cuh'
    - 'src/operator/contrib/nn/deformable_im2col.h'
    - 'src/operator/contrib/nn/modulated_deformable_im2col.cuh'
    - 'src/operator/contrib/nn/modulated_deformable_im2col.h'

    # files licensed under bsd 3-clause
    - 'cmake/upstream/FindBLAS.cmake'
    - 'cmake/upstream/FindCUDAToolkit.cmake'
    - 'cmake/upstream/select_compute_arch.cmake'
    - 'python/mxnet/onnx/mx2onnx/_export_onnx.py'
    - 'python/mxnet/onnx/mx2onnx/_op_translations/_op_translations_opset12.py'
    - 'python/mxnet/onnx/mx2onnx/_op_translations/_op_translations_opset13.py'
    - 'src/operator/contrib/erfinv-inl.h'
    - 'src/operator/numpy/np_einsum_op-inl.h'
    - 'src/operator/numpy/np_einsum_op.cc'
    - 'src/operator/numpy/np_einsum_path_op-inl.h'

    # files licensed under mit license
    - 'src/operator/modulated_deformable_convolution-inl.h'
    - 'src/operator/modulated_deformable_convolution.cc'
    - 'src/operator/modulated_deformable_convolution.cu'
    - 'src/operator/nn/layer_norm_cpu.h'

    # symlinks
    - 'include/dlpack' # symlink to 3rdparty/dlpack/include/dlpack
    - 'include/dmlc' # symlink to 3rdparty/dmlc-core/include/dmlc
    - 'include/mshadow' # symlink to 3rdparty/mshadow/mshadow
    - 'include/onednn' # symlinks to 3rdparty/onednn
    - 'include/nnvm' # symlinks to 3rdparty/tvm/nnvm/include/nnvm

    # test/build data
    - 'tests/python/dnnl/data/test_dnnl_test_dnnl_model_model1.json'

  comment: on-failure
================================================
FILE: .mxnet_root
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# This file marks the root directory of the Apache MXNet repository.
================================================
FILE: 3rdparty/ctc_include/LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
----
Copyright 2015-2016, Baidu USA LLC.
================================================
FILE: 3rdparty/ctc_include/contrib/moderngpu/LICENSE
================================================
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
================================================
FILE: 3rdparty/ctc_include/contrib/moderngpu/include/device/ctaloadbalance.cuh
================================================
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "ctasearch.cuh"
#include "loadstore.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// DeviceLoadBalancingSearch
// Upper Bound search from A (needles) into B (haystack). The A values are
// natural numbers from aBegin to aEnd. bFirst is the index of the B value at
// bBegin in shared memory.
// Serial, per-thread portion of the load-balancing search.
// Runs VT iterations; each iteration either
//   - stores bFirst + bBegin into a_shared[aBegin] and advances aBegin, or
//   - advances bBegin and reloads bKey from b_shared.
// With RangeCheck set, A only advances while aBegin < aEnd and either B is
// exhausted (bBegin >= bEnd) or aBegin < bKey. Without it only aBegin < bKey
// is tested and no range tests are performed.
// NOTE(review): VT and RangeCheck are not declared in the visible `template`
// header - presumably template parameters; confirm against upstream moderngpu.
template
MGPU_DEVICE void DeviceSerialLoadBalanceSearch(const int* b_shared, int aBegin,
int aEnd, int bFirst, int bBegin, int bEnd, int* a_shared) {
// Prime the haystack key. This read is unconditional, even with RangeCheck.
int bKey = b_shared[bBegin];
#pragma unroll
for(int i = 0; i < VT; ++i) {
// p selects which cursor moves this iteration: true -> A, false -> B.
bool p;
if(RangeCheck)
p = (aBegin < aEnd) && ((bBegin >= bEnd) || (aBegin < bKey));
else
p = aBegin < bKey;
if(p)
// Advance A (the needle).
a_shared[aBegin++] = bFirst + bBegin;
else
// Advance B (the haystack).
bKey = b_shared[++bBegin];
}
}
////////////////////////////////////////////////////////////////////////////////
// CTALoadBalance
// Computes upper_bound(counting_iterator(first), b_global) - 1.
// Unlike most other CTA* functions, CTALoadBalance loads from global memory.
// This returns the loaded B elements at the beginning or end of shared memory
// depending on the aFirst argument.
// CTALoadBalance requires NT * VT + 2 slots of shared memory.
// Parameters, as they are used in the body:
//   destCount      - passed to ComputeMergeRange as the A count; also the value
//                    used to pad the unused tail of the B buffer.
//   b_global       - source of the B values, read as b_global[b0 + i].
//   sourceCount    - passed to ComputeMergeRange as the B count; decides
//                    whether one trailing B term exists (b1 < sourceCount).
//   block          - index used by ComputeMergeRange to look up mp_global.
//   tid            - per-thread index; the load/fill loops start at it and
//                    stride by NT, and it selects the diagonal VT * tid.
//   mp_global      - merge-path partitions, forwarded to ComputeMergeRange.
//   indices_shared - work buffer: A results start at indices_shared, B values
//                    start aCount (+1 when loadPrecedingB) elements later.
//   loadPrecedingB - when set (and b0 != 0), b_global[b0 - 1] is also loaded
//                    and the returned b0 is decremented by one.
// Returns make_int4(a0, a1, b0, b1).
// NOTE(review): NT, VT and InputIt are not declared in the visible `template`
// header, and the calls below carry no explicit template arguments -
// presumably lost in extraction; confirm against upstream moderngpu.
template
MGPU_DEVICE int4 CTALoadBalance(int destCount, InputIt b_global,
int sourceCount, int block, int tid, const int* mp_global,
int* indices_shared, bool loadPrecedingB) {
// coop == 0 selects the plain (non-mergesort) range computation.
int4 range = ComputeMergeRange(destCount, sourceCount, block, 0, NT * VT,
mp_global);
int a0 = range.x;
int a1 = range.y;
int b0 = range.z;
int b1 = range.w;
// There is no element before index 0 to load.
if(!b0) loadPrecedingB = false;
// Load one trailing term from B. If we're already at the end, fill the
// end of the buffer with destCount.
int aCount = a1 - a0;
int bCount = b1 - b0;
int extended = b1 < sourceCount;
int loadCount = bCount + extended;
int fillCount = NT * VT + 1 - loadCount - aCount;
int* a_shared = indices_shared;
int* b_shared = indices_shared + aCount + (int)loadPrecedingB;
// Load the B values.
// DeviceMemToMemLoop(bCount + extended + (int)loadPrecedingB,
// b_global + b0 - (int)loadPrecedingB, tid,
// b_shared - (int)loadPrecedingB);
// When loadPrecedingB is set the loop starts one element early, so the
// thread with tid == 0 also copies b_global[b0 - 1] into b_shared[-1].
for(int i = tid - (int)loadPrecedingB; i < bCount + extended; i += NT)
b_shared[i] = b_global[b0 + i];
// Fill the end of the array with destCount.
for(int i = tid + extended; i < fillCount; i += NT)
b_shared[bCount + i] = destCount;
// Barrier between the shared-memory writes above and the reads below.
__syncthreads();
// Run a merge path to find the start of the serial merge for each thread.
int diag = VT * tid;
int mp = MergePath(mgpu::counting_iterator(a0),
aCount, b_shared, bCount, diag, mgpu::less());
int a0tid = a0 + mp;
int b0tid = diag - mp;
// Subtract 1 from b0 because we want to return upper_bound - 1.
// a_shared - a0 rebases the output so it can be indexed by absolute A index.
DeviceSerialLoadBalanceSearch(b_shared, a0tid, a1, b0 - 1,
b0tid, bCount, a_shared - a0);
__syncthreads();
// Report the preceding B element as part of the returned range.
b0 -= (int)loadPrecedingB;
return make_int4(a0, a1, b0, b1);
}
} // namespace mgpu
================================================
FILE: 3rdparty/ctc_include/contrib/moderngpu/include/device/ctamerge.cuh
================================================
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "ctasearch.cuh"
#include "loadstore.cuh"
#include "sortnetwork.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// SerialMerge
// Per-thread serial merge of VT elements read from keys_shared through two
// cursors that start at aBegin and bBegin.
//   results[i] receives the chosen key, indices[i] the index it came from.
//   With RangeCheck set, A is taken when B is exhausted (bBegin >= bEnd), or
//   when A is not exhausted and comp(bKey, aKey) is false. Without it, only
//   comp(bKey, aKey) is consulted and aEnd/bEnd are not read.
// Ends with a __syncthreads() barrier.
// NOTE(review): VT, RangeCheck, T and Comp are not declared in the visible
// `template` header - presumably template parameters; confirm upstream.
template
MGPU_DEVICE void SerialMerge(const T* keys_shared, int aBegin, int aEnd,
int bBegin, int bEnd, T* results, int* indices, Comp comp) {
// Both head keys are read unconditionally, before any range test.
T aKey = keys_shared[aBegin];
T bKey = keys_shared[bBegin];
#pragma unroll
for(int i = 0; i < VT; ++i) {
// p == true: emit from A; p == false: emit from B.
bool p;
if(RangeCheck)
p = (bBegin >= bEnd) || ((aBegin < aEnd) && !comp(bKey, aKey));
else
p = !comp(bKey, aKey);
results[i] = p ? aKey : bKey;
// When RangeCheck is false the recorded B index is bBegin - 1.
// NOTE(review): reason for this offset is not visible here - confirm with
// the callers that pass RangeCheck == false.
indices[i] = p ? aBegin : bBegin - !RangeCheck;
// Advance the consumed cursor and load its next key.
if(p) aKey = keys_shared[++aBegin];
else bKey = keys_shared[++bBegin];
}
__syncthreads();
}
////////////////////////////////////////////////////////////////////////////////
// FindMergesortFrame and FindMergesortInterval help mergesort (both CTA and
// global merge pass levels) locate lists within the single source array.
// FindMergesortFrame returns (offset of a, offset of b, length of list).
MGPU_HOST_DEVICE int3 FindMergesortFrame(int coop, int block, int nv) {
    // coop CTAs (or threads) cooperate on one merge of two lists. Clearing the
    // low bits of block selected by (coop - 1) yields the ID of the first
    // participant working on the same merge.
    const int firstBlock = block & ~(coop - 1);

    // Each of the two input lists covers half of the cooperating participants.
    const int listSize = nv * (coop >> 1);

    // x: start of list A, y: start of list B, z: length of one list.
    const int aOffset = nv * firstBlock;
    return make_int3(aOffset, aOffset + listSize, listSize);
}
// Returns (a0, a1, b0, b1) into mergesort input lists between mp0 and mp1.
MGPU_HOST_DEVICE int4 FindMergesortInterval(int3 frame, int coop, int block,
    int nv, int count, int mp0, int mp1) {

    // Diagonal of this block, measured from the start of the A sublist.
    const int diag = nv * block - frame.x;

    // The end partition of the last block of a merge is stored as the begin
    // partition of the next merge: the same partition, but expressed in the
    // wrong coordinate system, so it reads 0 where it should be the list
    // size. For that last block the interval ends are therefore taken from
    // the frame itself instead of from mp1.
    const bool lastBlock = ((coop - 1) & block) == coop - 1;

    const int aBegin = frame.x + mp0;
    const int bBegin = min(count, frame.y + diag - mp0);

    const int aEnd = lastBlock ? frame.x + frame.z : frame.x + mp1;
    const int bEnd = lastBlock ? frame.y + frame.z
                               : frame.y + diag + nv - mp1;

    // Every bound except aBegin is clamped to the number of elements.
    return make_int4(aBegin, min(count, aEnd), bBegin, min(count, bEnd));
}
////////////////////////////////////////////////////////////////////////////////
// ComputeMergeRange
MGPU_HOST_DEVICE int4 ComputeMergeRange(int aCount, int bCount, int block,
    int coop, int NV, const int* mp_global) {

    // Merge paths for this block and the next one, as written by the
    // partitioning kernel.
    const int mpBegin = mp_global[block];
    const int mpEnd = mp_global[block + 1];

    // Mergesort mode: both lists live in one array, so the range is derived
    // from the merge frame of the cooperating blocks.
    if(coop) {
        const int3 frame = FindMergesortFrame(coop, block, NV);
        return FindMergesortInterval(frame, coop, block, NV, aCount, mpBegin,
            mpEnd);
    }

    // Plain merge mode: A spans [mpBegin, mpEnd); B takes whatever remains of
    // this block's NV outputs, clamped to the total element count.
    const int gid = NV * block;
    return make_int4(
        mpBegin,                                      // a0
        mpEnd,                                        // a1
        gid - mpBegin,                                // b0
        min(aCount + bCount, gid + NV) - mpEnd);      // b1
}
////////////////////////////////////////////////////////////////////////////////
// CTA mergesort support
// One merge pass of the CTA blocksort for a single thread.
// Groups of coop threads merge two adjacent sublists of keys_shared; this
// thread finds its split point with MergePath and then merges its share with
// SerialMerge into the per-thread arrays keys and indices.
// NOTE(review): the bit masks below presumably require coop to be a power of
// two; VT, T and Comp are not declared in the visible `template` header and
// the calls carry no explicit template arguments - confirm against upstream.
template
MGPU_DEVICE void CTABlocksortPass(T* keys_shared, int tid, int count,
int coop, T* keys, int* indices, Comp comp) {
// tid with the low bits selected by (coop - 1) cleared.
int list = ~(coop - 1) & tid;
// Offset of this thread inside its group, in elements, clamped to count.
int diag = min(count, VT * ((coop - 1) & tid));
int start = VT * list;
// [a0, b0) is the first sublist, [b0, b1) the second; all clamped to count.
int a0 = min(count, start);
int b0 = min(count, start + VT * (coop / 2));
int b1 = min(count, start + VT * coop);
int p = MergePath(keys_shared + a0, b0 - a0,
keys_shared + b0, b1 - b0, diag, comp);
// Start at a0 + p in the first sublist and b0 + diag - p in the second.
SerialMerge(keys_shared, a0 + p, b0, b0 + diag - p, b1, keys,
indices, comp);
}
// Repeats CTABlocksortPass with coop = 2, 4, ... up to and including NT,
// writing the merged keys back to keys_shared after every pass.
// When HasValues is set, threadValues is routed through values_shared after
// each pass using the indices produced by that pass.
// NOTE(review): NT, VT, HasValues, KeyType, ValType and Comp are not declared
// in the visible `template` header; the helper calls carry no explicit
// template arguments - presumably lost in extraction, confirm upstream.
template
MGPU_DEVICE void CTABlocksortLoop(ValType threadValues[VT],
KeyType* keys_shared, ValType* values_shared, int tid, int count,
Comp comp) {
#pragma unroll
for(int coop = 2; coop <= NT; coop *= 2) {
// Per-thread outputs of this pass.
int indices[VT];
KeyType keys[VT];
CTABlocksortPass(keys_shared, tid, count, coop, keys,
indices, comp);
if(HasValues) {
// Exchange the values through shared memory.
DeviceThreadToShared(threadValues, tid, values_shared);
DeviceGather(NT * VT, values_shared, indices, tid,
threadValues);
}
// Store results in shared memory in sorted order.
DeviceThreadToShared(keys, tid, keys_shared);
}
}
////////////////////////////////////////////////////////////////////////////////
// CTAMergesort
// Caller provides the keys in shared memory. This function sorts the first
// count elements.
// Sorts the keys (and optionally values) held by a CTA: each thread first
// sorts its own VT keys in register, the sorted runs are written to
// keys_shared, and CTABlocksortLoop merges them pairwise.
// NOTE(review): the template parameter list is missing in this copy; the body
// references VT, Stable, KeyType, ValType and Comp, and the calls below
// presumably carried explicit template arguments.
template
MGPU_DEVICE void CTAMergesort(KeyType threadKeys[VT], ValType threadValues[VT],
KeyType* keys_shared, ValType* values_shared, int count, int tid,
Comp comp) {
// Stable sort the keys in the thread.
// Threads whose first element lies at or beyond count hold no valid data and
// skip the in-register sort.
if(VT * tid < count) {
// Stable selects between the two in-register sorting networks.
if(Stable)
OddEvenTransposeSort(threadKeys, threadValues, comp);
else
OddEvenMergesort(threadKeys, threadValues, comp);
}
// Store the locally sorted keys into shared memory.
DeviceThreadToShared(threadKeys, tid, keys_shared);
// Recursively merge lists until the entire CTA is sorted.
CTABlocksortLoop(threadValues, keys_shared,
values_shared, tid, count, comp);
}
// Keys-only front end for CTAMergesort.
// NOTE(review): the template parameter list and the explicit template
// arguments of the CTAMergesort call are missing in this copy; presumably the
// call disables value handling, which is why an uninitialized scratch array
// and an aliased shared pointer are acceptable here -- confirm against upstream.
template
MGPU_DEVICE void CTAMergesortKeys(KeyType threadKeys[VT],
KeyType* keys_shared, int count, int tid, Comp comp) {
// Placeholder values array; never initialized in this function.
int valuesTemp[VT];
// keys_shared is reinterpreted as the values buffer purely to satisfy the
// signature.
CTAMergesort(threadKeys, valuesTemp, keys_shared,
(int*)keys_shared, count, tid, comp);
}
// Key/value front end for CTAMergesort: forwards every argument unchanged.
// NOTE(review): the template parameter list and the explicit template
// arguments of the CTAMergesort call are missing in this copy; presumably the
// call enables value handling -- confirm against upstream.
template
MGPU_DEVICE void CTAMergesortPairs(KeyType threadKeys[VT],
ValType threadValues[VT], KeyType* keys_shared, ValType* values_shared,
int count, int tid, Comp comp) {
CTAMergesort(threadKeys, threadValues, keys_shared,
values_shared, count, tid, comp);
}
////////////////////////////////////////////////////////////////////////////////
// DeviceMergeKeysIndices
// Loads the A interval [range.x, range.y) and the B interval
// [range.z, range.w) into keys_shared, finds each thread's starting point with
// MergePath, and serially merges into results / indices.
// NOTE(review): the template parameter list is missing in this copy; the body
// references VT, LoadExtended, It1, It2, T and Comp, and the helper calls
// presumably carried explicit template arguments.
template
MGPU_DEVICE void DeviceMergeKeysIndices(It1 a_global, int aCount, It2 b_global,
int bCount, int4 range, int tid, T* keys_shared, T* results, int* indices,
Comp comp) {
int a0 = range.x;
int a1 = range.y;
int b0 = range.z;
int b1 = range.w;
if(LoadExtended) {
// extended is true only when both inputs still have at least one element past
// this tile's interval, so reading one extra element from each is in bounds.
bool extended = (a1 < aCount) && (b1 < bCount);
// From here on aCount / bCount are the tile-local interval sizes.
aCount = a1 - a0;
bCount = b1 - b0;
int aCount2 = aCount + (int)extended;
int bCount2 = bCount + (int)extended;
// Load one element past the end of each input to avoid having to use
// range checking in the merge loop.
DeviceLoad2ToShared(a_global + a0, aCount2,
b_global + b0, bCount2, tid, keys_shared);
// Run a Merge Path search for each thread's starting point.
int diag = VT * tid;
// B begins at aCount2 in shared memory because of the optional extra A element.
int mp = MergePath(keys_shared, aCount,
keys_shared + aCount2, bCount, diag, comp);
// Compute the ranges of the sources in shared memory.
int a0tid = mp;
int b0tid = aCount2 + diag - mp;
if(extended) {
// End bounds are passed as 0 here; presumably this SerialMerge instantiation
// performs no range checking and relies on the extra loaded elements -- confirm.
SerialMerge(keys_shared, a0tid, 0, b0tid, 0, results,
indices, comp);
} else {
int a1tid = aCount;
int b1tid = aCount2 + bCount;
SerialMerge(keys_shared, a0tid, a1tid, b0tid, b1tid,
results, indices, comp);
}
} else {
// Use the input intervals from the ranges between the merge path
// intersections.
aCount = a1 - a0;
bCount = b1 - b0;
// Load the data into shared memory.
DeviceLoad2ToShared(a_global + a0, aCount, b_global + b0,
bCount, tid, keys_shared);
// Run a merge path to find the start of the serial merge for each
// thread.
int diag = VT * tid;
int mp = MergePath(keys_shared, aCount,
keys_shared + aCount, bCount, diag, comp);
// Compute the ranges of the sources in shared memory.
int a0tid = mp;
int a1tid = aCount;
int b0tid = aCount + diag - mp;
int b1tid = aCount + bCount;
// Serial merge into register.
SerialMerge(keys_shared, a0tid, a1tid, b0tid, b1tid, results,
indices, comp);
}
}
////////////////////////////////////////////////////////////////////////////////
// DeviceMerge
// Merge pairs from global memory into global memory. Useful factorization to
// enable calling from merge, mergesort, and locality sort.
// Merges one tile: keys are merged through shared memory and written to
// keys_global at offset NT * VT * block; when HasValues is set, the values are
// moved to vals_global at the same offset using the indices from the key merge.
// NOTE(review): the template parameter list is missing in this copy; the body
// references NT, VT, HasValues, KeyType and the iterator/comparator types, and
// the helper calls presumably carried explicit template arguments.
template
MGPU_DEVICE void DeviceMerge(KeysIt1 aKeys_global, ValsIt1 aVals_global,
int aCount, KeysIt2 bKeys_global, ValsIt2 bVals_global, int bCount,
int tid, int block, int4 range, KeyType* keys_shared, int* indices_shared,
KeysIt3 keys_global, ValsIt3 vals_global, Comp comp) {
// Per-thread merge output: merged keys and the source index of each.
KeyType results[VT];
int indices[VT];
DeviceMergeKeysIndices(aKeys_global, aCount,
bKeys_global, bCount, range, tid, keys_shared, results, indices, comp);
// Store merge results back to shared memory.
DeviceThreadToShared(results, tid, keys_shared);
// Store merged keys to global memory.
// aCount / bCount are re-derived as the tile-local interval sizes.
aCount = range.y - range.x;
bCount = range.w - range.z;
DeviceSharedToGlobal(aCount + bCount, keys_shared, tid,
keys_global + NT * VT * block);
// Copy the values.
if(HasValues) {
// Publish the source indices, then let the transfer helper pick each output
// value from the A or B value interval (aCount marks the split).
DeviceThreadToShared(indices, tid, indices_shared);
DeviceTransferMergeValuesShared(aCount + bCount,
aVals_global + range.x, bVals_global + range.z, aCount,
indices_shared, tid, vals_global + NT * VT * block);
}
}
} // namespace mgpu
================================================
FILE: 3rdparty/ctc_include/contrib/moderngpu/include/device/ctascan.cuh
================================================
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "../mgpuenums.h"
#include "deviceutil.cuh"
#include "intrinsics.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// CTAReduce
// Generic CTA-wide reduction through shared memory.
// Every thread contributes one value and every thread receives the full
// reduction. All NT threads must call Reduce because it contains barriers.
// NOTE(review): the template parameter list is garbled in this copy
// (`template >`); the body references NT and Op.
template >
struct CTAReduce {
typedef typename Op::first_argument_type T;
// One shared slot per thread.
enum { Size = NT, Capacity = NT };
struct Storage { T shared[Capacity]; };
// tid     - thread index within the CTA, used to index storage.shared.
// x       - this thread's input.
// storage - scratch space shared by the CTA.
// op      - binary reduction operator.
// Returns the reduction of all inputs (read from slot 0).
MGPU_DEVICE static T Reduce(int tid, T x, Storage& storage, Op op = Op()) {
storage.shared[tid] = x;
__syncthreads();
// Fold the data in half with each pass.
// The halving covers every slot only when NT is a power of two (presumably
// guaranteed by callers).
#pragma unroll
for(int destCount = NT / 2; destCount >= 1; destCount /= 2) {
if(tid < destCount) {
// Read from the right half and store to the left half.
x = op(x, storage.shared[destCount + tid]);
storage.shared[tid] = x;
}
__syncthreads();
}
T total = storage.shared[0];
// Barrier after the read so storage can be reused by the caller without a
// later write racing with this load.
__syncthreads();
return total;
}
};
#if __CUDA_ARCH__ >= 300
// CTAReduce specialization for integer addition, compiled only when
// __CUDA_ARCH__ >= 300. Uses shfl_add (defined elsewhere) so only WARP_SIZE
// shared slots are needed.
// NOTE(review): the template parameter list and the specialization arguments
// are garbled in this copy (`struct CTAReduce >`).
template
struct CTAReduce > {
typedef mgpu::plus Op;
typedef int T;
enum { Size = NT, Capacity = WARP_SIZE };
struct Storage { int shared[Capacity]; };
// Returns the sum of x over all threads; every thread gets the same result.
// All threads must call because the function contains barriers.
MGPU_DEVICE static int Reduce(int tid, int x, Storage& storage,
Op op = Op()) {
const int NumSections = WARP_SIZE;
const int SecSize = NT / NumSections;
// Position within the section; the mask form assumes SecSize is a power of two.
int lane = (SecSize - 1) & tid;
int sec = tid / SecSize;
// In the first phase, threads cooperatively find the reduction within
// their segment. The segments are SecSize threads (NT / WARP_SIZE)
// wide.
#pragma unroll
for(int offset = 1; offset < SecSize; offset *= 2)
x = shfl_add(x, offset, SecSize);
// The last thread in each segment stores the local reduction to shared
// memory.
if(SecSize - 1 == lane) storage.shared[sec] = x;
__syncthreads();
// Reduce the totals of each input segment. The spine is WARP_SIZE
// threads wide.
if(tid < NumSections) {
x = storage.shared[tid];
#pragma unroll
for(int offset = 1; offset < NumSections; offset *= 2)
x = shfl_add(x, offset, NumSections);
storage.shared[tid] = x;
}
__syncthreads();
// The last spine slot holds the value accumulated across all sections.
int reduction = storage.shared[NumSections - 1];
__syncthreads();
return reduction;
}
};
// CTAReduce specialization for integer maximum, compiled only when
// __CUDA_ARCH__ >= 300. Same two-phase structure as the addition
// specialization, using shfl_max (defined elsewhere) instead of shfl_add.
// NOTE(review): the template parameter list and the specialization arguments
// are garbled in this copy (`struct CTAReduce >`).
template
struct CTAReduce > {
typedef mgpu::maximum Op;
enum { Size = NT, Capacity = WARP_SIZE };
struct Storage { int shared[Capacity]; };
// Returns the maximum of x over all threads; every thread gets the same
// result. All threads must call because the function contains barriers.
MGPU_DEVICE static int Reduce(int tid, int x, Storage& storage,
Op op = Op()) {
const int NumSections = WARP_SIZE;
const int SecSize = NT / NumSections;
// Position within the section; the mask form assumes SecSize is a power of two.
int lane = (SecSize - 1) & tid;
int sec = tid / SecSize;
// Phase 1: combine within each section of SecSize threads.
#pragma unroll
for(int offset = 1; offset < SecSize; offset *= 2)
x = shfl_max(x, offset, SecSize);
// The last thread of each section publishes the section result.
if(SecSize - 1 == lane) storage.shared[sec] = x;
__syncthreads();
// Phase 2: the first NumSections threads combine the section results.
if(tid < NumSections) {
x = storage.shared[tid];
#pragma unroll
for(int offset = 1; offset < NumSections; offset *= 2)
x = shfl_max(x, offset, NumSections);
storage.shared[tid] = x;
}
__syncthreads();
// The last spine slot holds the value accumulated across all sections.
int reduction = storage.shared[NumSections - 1];
__syncthreads();
return reduction;
}
};
#endif // __CUDA_ARCH__ >= 300
////////////////////////////////////////////////////////////////////////////////
// CTAScan
// Generic CTA-wide scan through shared memory.
// Uses two NT-sized halves of the storage array alternately: each pass reads
// from one half and writes to the other.
// NOTE(review): the template parameter list is garbled in this copy
// (`template >`); the body references NT and Op.
template >
struct CTAScan {
typedef typename Op::result_type T;
enum { Size = NT, Capacity = 2 * NT + 1 };
struct Storage { T shared[Capacity]; };
// tid      - thread index within the CTA.
// x        - this thread's input.
// storage  - scratch space shared by the CTA.
// total    - receives the reduction of all inputs.
// type     - MgpuScanTypeExc returns the exclusive scan; any other value
//            returns the inclusive scan.
// identity - value returned to thread 0 for an exclusive scan.
// All threads must call because the function contains barriers.
MGPU_DEVICE static T Scan(int tid, T x, Storage& storage, T* total,
MgpuScanType type = MgpuScanTypeExc, T identity = (T)0, Op op = Op()) {
storage.shared[tid] = x;
// Offset of the half currently holding valid data (0 or NT).
int first = 0;
__syncthreads();
#pragma unroll
for(int offset = 1; offset < NT; offset += offset) {
// Combine with the element `offset` positions to the left, if there is one.
if(tid >= offset)
x = op(storage.shared[first + tid - offset], x);
// Switch halves before writing so readers of this pass are not disturbed.
first = NT - first;
storage.shared[first + tid] = x;
__syncthreads();
}
// The last slot of the inclusive scan is the total.
*total = storage.shared[first + NT - 1];
// Exclusive result is the left neighbor's inclusive result.
if(MgpuScanTypeExc == type)
x = tid ? storage.shared[first + tid - 1] : identity;
__syncthreads();
return x;
}
// Convenience overload: exclusive scan with identity 0, discarding the total.
MGPU_DEVICE static T Scan(int tid, T x, Storage& storage) {
T total;
return Scan(tid, x, storage, &total, MgpuScanTypeExc, (T)0, Op());
}
};
////////////////////////////////////////////////////////////////////////////////
// Special partial specialization for CTAScan on Kepler.
// This uses the shfl intrinsic to reduce scan latency.
#if __CUDA_ARCH__ >= 300
// CTAScan specialization for integer addition built on shfl_add (defined
// elsewhere), compiled only when __CUDA_ARCH__ >= 300.
// NOTE(review): the template parameter list and the specialization arguments
// are garbled in this copy (`struct CTAScan >`).
template
struct CTAScan > {
typedef mgpu::plus Op;
enum { Size = NT, NumSegments = WARP_SIZE, SegSize = NT / NumSegments };
// Slots 0..NumSegments-1 hold per-segment partials; slot NumSegments holds the
// total. Storage declares one int more than Capacity; that extra slot is not
// accessed by the code below.
enum { Capacity = NumSegments + 1 };
struct Storage { int shared[Capacity + 1]; };
// tid      - thread index within the CTA.
// x        - this thread's input.
// total    - receives the sum of all inputs.
// type     - MgpuScanTypeExc returns the exclusive scan; any other value
//            returns the inclusive scan.
// identity - if non-zero, replaces thread 0's exclusive result.
// All threads must call because the function contains barriers.
MGPU_DEVICE static int Scan(int tid, int x, Storage& storage, int* total,
MgpuScanType type = MgpuScanTypeExc, int identity = 0, Op op = Op()) {
// Define WARP_SIZE segments that are NT / WARP_SIZE large.
// Each warp makes log(SegSize) shfl_add calls.
// The spine makes log(WARP_SIZE) shfl_add calls.
int lane = (SegSize - 1) & tid;
int segment = tid / SegSize;
// Scan each segment using shfl_add.
int scan = x;
#pragma unroll
for(int offset = 1; offset < SegSize; offset *= 2)
scan = shfl_add(scan, offset, SegSize);
// Store the reduction (last element) of each segment into storage.
if(SegSize - 1 == lane) storage.shared[segment] = scan;
__syncthreads();
// Warp 0 does a full shfl warp scan on the partials. The total is
// stored to shared[NumSegments]. (NumSegments = WARP_SIZE)
if(tid < NumSegments) {
int y = storage.shared[tid];
// This inner `scan` shadows the outer one; the outer value is untouched.
int scan = y;
#pragma unroll
for(int offset = 1; offset < NumSegments; offset *= 2)
scan = shfl_add(scan, offset, NumSegments);
// Subtracting the input converts the inclusive spine scan to exclusive.
storage.shared[tid] = scan - y;
if(NumSegments - 1 == tid) storage.shared[NumSegments] = scan;
}
__syncthreads();
// Add the scanned partials back in and convert to exclusive scan.
scan += storage.shared[segment];
if(MgpuScanTypeExc == type) {
scan -= x;
// A zero identity needs no special handling: thread 0's exclusive sum is 0.
if(identity && !tid) scan = identity;
}
*total = storage.shared[NumSegments];
__syncthreads();
return scan;
}
// Convenience overload: exclusive scan with identity 0, discarding the total.
MGPU_DEVICE static int Scan(int tid, int x, Storage& storage) {
int total;
return Scan(tid, x, storage, &total, MgpuScanTypeExc, 0);
}
};
#endif // __CUDA_ARCH__ >= 300
////////////////////////////////////////////////////////////////////////////////
// CTABinaryScan
// CTABinaryScan
// Exclusive scan over one boolean predicate per thread across the CTA.
//   tid    - thread index within the CTA.
//   x      - this thread's predicate.
//   shared - scratch space with room for at least NumWarps + 1 ints
//            (slot NumWarps carries the CTA-wide total).
//   total  - receives the number of threads in the CTA whose predicate is set.
// Returns the number of set predicates among threads with a lower tid.
// All NT threads must call because the function contains barriers.
//
// Fixes relative to the previous revision:
//   * lane was the constant WARP_SIZE - 1, so every thread of a warp received
//     the same intra-warp count; it is now this thread's lane index.
//   * The shfl path never wrote shared[NumWarps], so *total read an
//     uninitialized slot; the last spine thread now stores the total, matching
//     the non-shfl path.
//   * The template parameter list, which the body requires for NT, is restored.
template<int NT>
MGPU_DEVICE int CTABinaryScan(int tid, bool x, int* shared, int* total) {
	const int NumWarps = NT / WARP_SIZE;
	int warp = tid / WARP_SIZE;
	int lane = (WARP_SIZE - 1) & tid;

	// Store the bit totals for each warp.
	uint bits = __ballot(x);
	shared[warp] = popc(bits);
	__syncthreads();

#if __CUDA_ARCH__ >= 300
	if(tid < NumWarps) {
		int warpCount = shared[tid];
		int scan = warpCount;
		#pragma unroll
		for(int offset = 1; offset < NumWarps; offset *= 2)
			scan = shfl_add(scan, offset, NumWarps);
		// Convert the inclusive scan of warp totals to exclusive.
		shared[tid] = scan - warpCount;
		// The last spine thread holds the inclusive total for the whole CTA.
		if(NumWarps - 1 == tid) shared[NumWarps] = scan;
	}
	__syncthreads();
#else
	// Thread 0 scans warp totals.
	if(!tid) {
		int scan = 0;
		#pragma unroll
		for(int i = 0; i < NumWarps; ++i) {
			int y = shared[i];
			shared[i] = scan;
			scan += y;
		}
		shared[NumWarps] = scan;
	}
	__syncthreads();
#endif // __CUDA_ARCH__ >= 300

	// Add the warp scan back into the partials: the warp's exclusive offset
	// plus the number of set predicates in lanes below this one.
	int scan = shared[warp] + __popc(bfe(bits, 0, lane));
	*total = shared[NumWarps];
	__syncthreads();
	return scan;
}
} // namespace mgpu
================================================
FILE: 3rdparty/ctc_include/contrib/moderngpu/include/device/ctasearch.cuh
================================================
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "deviceutil.cuh"
#include "../mgpudevice.cuh"
namespace mgpu {
// One iteration of a (possibly biased) binary search over data[begin, end).
// Narrows the interval in place: begin moves past mid, or end drops to mid.
//   shift - log2 of the bias denominator. shift == 1 probes the plain
//           midpoint; larger shifts probe closer to `end`, since
//           mid = (begin + (2^shift - 1) * end) / 2^shift.
// NOTE(review): the template parameter list is missing in this copy; the body
// references Bounds, IntT, It, T and Comp.
template
MGPU_HOST_DEVICE void BinarySearchIt(It data, int& begin, int& end, T key,
int shift, Comp comp) {
// IntT is the arithmetic type for the weighted sum (presumably chosen wide
// enough to avoid overflow of scale * end -- confirm against callers).
IntT scale = (1<< shift) - 1;
int mid = (int)((begin + scale * end)>> shift);
T key2 = data[mid];
// Upper-bound search advances past elements for which key does not compare
// before them; lower-bound search advances only past elements that compare
// before key.
bool pred = (MgpuBoundsUpper == Bounds) ?
!comp(key, key2) :
comp(key2, key);
if(pred) begin = mid + 1;
else end = mid;
}
// Binary search whose first few probes are biased toward the end of the
// array, then finished with ordinary midpoint probes.
//   levels - how many biased probes to attempt (shift 9, 7, 5, 4 in that
//            order; levels >= 4 enables all four, levels == 0 none).
// Returns the index at which the search interval collapses.
// NOTE(review): the template parameter list and the explicit template
// arguments of the BinarySearchIt calls are missing in this copy.
template
MGPU_HOST_DEVICE int BiasedBinarySearch(It data, int count, T key, int levels,
Comp comp) {
int begin = 0;
int end = count;
// Each biased probe is skipped once the interval is already empty.
if(levels >= 4 && begin < end)
BinarySearchIt(data, begin, end, key, 9, comp);
if(levels >= 3 && begin < end)
BinarySearchIt(data, begin, end, key, 7, comp);
if(levels >= 2 && begin < end)
BinarySearchIt(data, begin, end, key, 5, comp);
if(levels >= 1 && begin < end)
BinarySearchIt(data, begin, end, key, 4, comp);
// Finish with unbiased midpoint probes.
while(begin < end)
BinarySearchIt(data, begin, end, key, 1, comp);
return begin;
}
// Plain binary search over data[0, count) using midpoint probes only.
// Returns the index at which the search interval collapses (0 when count is 0).
// NOTE(review): the template parameter list and the explicit template
// arguments of the BinarySearchIt call (which select upper vs lower bound) are
// missing in this copy.
template
MGPU_HOST_DEVICE int BinarySearch(It data, int count, T key, Comp comp) {
int begin = 0;
int end = count;
while(begin < end)
BinarySearchIt(data, begin, end, key, 1, comp);
return begin;
}
////////////////////////////////////////////////////////////////////////////////
// MergePath search
// Finds where the cross-diagonal `diag` intersects the merge path of a and b.
// Returns the number of elements taken from a among the first diag merged
// elements; the number taken from b is diag minus the return value.
// NOTE(review): the template parameter list is missing in this copy and the
// iterator_traits argument has been stripped; the body references Bounds, It1,
// It2 and Comp.
template
MGPU_HOST_DEVICE int MergePath(It1 a, int aCount, It2 b, int bCount, int diag,
Comp comp) {
typedef typename std::iterator_traits::value_type T;
// Feasible range for the a-index along the diagonal: at least diag - bCount
// must come from a, and at most min(diag, aCount) can.
int begin = max(0, diag - bCount);
int end = min(diag, aCount);
while(begin < end) {
int mid = (begin + end)>> 1;
// Compare the candidate from a with its partner on the same diagonal in b.
T aKey = a[mid];
T bKey = b[diag - 1 - mid];
// The two branches differ only in how equal keys are resolved between a and b.
bool pred = (MgpuBoundsUpper == Bounds) ?
comp(aKey, bKey) :
!comp(bKey, aKey);
if(pred) begin = mid + 1;
else end = mid;
}
return begin;
}
////////////////////////////////////////////////////////////////////////////////
// SegmentedMergePath search
// Merge Path search restricted by segment boundaries. Both lists live in the
// same array `keys`, at aOffset and bOffset. Returns the a-relative partition
// index for diagonal `diag`.
// NOTE(review): the template parameter list is missing in this copy; the body
// references InputIt and Comp.
template
MGPU_HOST_DEVICE int SegmentedMergePath(InputIt keys, int aOffset, int aCount,
int bOffset, int bCount, int leftEnd, int rightStart, int diag, Comp comp) {
// leftEnd and rightStart are defined from the origin, and diag is defined
// from aOffset.
// We only need to run a Merge Path search if the diagonal intersects the
// segment that strides the left and right halves (i.e. is between leftEnd
// and rightStart).
// Diagonal at or before leftEnd: everything up to it comes from a.
if(aOffset + diag <= leftEnd) return diag;
// Diagonal at or after rightStart: all of a has been consumed.
if(aOffset + diag >= rightStart) return aCount;
// Only b elements before rightStart take part in the search.
bCount = min(bCount, rightStart - bOffset);
// Elements of a before leftEnd are always taken, so the search starts there.
int begin = max(max(leftEnd - aOffset, 0), diag - bCount);
int end = min(diag, aCount);
while(begin < end) {
int mid = (begin + end)>> 1;
int ai = aOffset + mid;
int bi = bOffset + diag - 1 - mid;
// Same predicate as the non-upper-bound branch of MergePath.
bool pred = !comp(keys[bi], keys[ai]);
if(pred) begin = mid + 1;
else end = mid;
}
return begin;
}
////////////////////////////////////////////////////////////////////////////////
// BalancedPath search
// Adjusts a Merge Path partition so that runs of equal keys are shared between
// a and b. Returns int2(aIndex, star), where aIndex is the adjusted a-side
// partition and star (0 or 1) marks a "starred" partition as set below.
//   levels - forwarded to BiasedBinarySearch.
// NOTE(review): the template parameter list, the iterator_traits argument and
// the explicit template arguments of the search calls are missing in this
// copy; the body references Duplicates, InputIt1, InputIt2 and Comp.
template
MGPU_HOST_DEVICE int2 BalancedPath(InputIt1 a, int aCount, InputIt2 b,
int bCount, int diag, int levels, Comp comp) {
typedef typename std::iterator_traits::value_type T;
// Start from the ordinary merge path partition.
int p = MergePath(a, aCount, b, bCount, diag, comp);
int aIndex = p;
int bIndex = diag - p;
bool star = false;
// Nothing to balance when b is exhausted at this diagonal.
if(bIndex < bCount) {
if(Duplicates) {
T x = b[bIndex];
// Search for the beginning of the duplicate run in both A and B.
// NOTE(review): the original comment broke off after "Because"; presumably the
// biased search is used because the run start is expected near the end of the
// searched prefix -- confirm against upstream.
int aStart = BiasedBinarySearch(a, aIndex, x,
levels, comp);
int bStart = BiasedBinarySearch(b, bIndex, x,
levels, comp);
// The distance between the merge path and the lower_bound is the
// 'run'. We add up the a- and b- runs and evenly distribute them to
// get a stairstep path.
int aRun = aIndex - aStart;
int bRun = bIndex - bStart;
int xCount = aRun + bRun;
// Attempt to advance b and regress a.
int bAdvance = max(xCount>> 1, bRun);
// Look just far enough into b to learn whether the run extends that far.
int bEnd = min(bCount, bStart + bAdvance + 1);
int bRunEnd = BinarySearch(b + bIndex,
bEnd - bIndex, x, comp) + bIndex;
bRun = bRunEnd - bStart;
// b cannot advance beyond the end of its own run.
bAdvance = min(bAdvance, bRun);
int aAdvance = xCount - bAdvance;
// a is one ahead of b while b still has a matching element left.
bool roundUp = (aAdvance == bAdvance + 1) && (bAdvance < bRun);
aIndex = aStart + aAdvance;
if(roundUp) star = true;
} else {
if(aIndex && aCount) {
T aKey = a[aIndex - 1];
T bKey = b[bIndex];
// If the last consumed element in A (aIndex - 1) is the same as
// the next element in B (bIndex), we're sitting at a starred
// partition.
if(!comp(aKey, bKey)) star = true;
}
}
}
return make_int2(aIndex, star);
}
} // namespace mgpu
================================================
FILE: 3rdparty/ctc_include/contrib/moderngpu/include/device/ctasegreduce.cuh
================================================
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "ctasegscan.cuh"
#include "ctasearch.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// Segmented reduce utility functions.
// Extract the upper-bound indices from the coded ranges. Decrement to include
// the first addressed row/segment.
// Decoded form of a pair of packed limits, as produced by DeviceShiftRange.
struct SegReduceRange {
// Low 31 bits of the first limit.
int begin;
// Low 31 bits of the second limit, plus one when flushLast is false.
int end;
// Difference of the two masked limits, taken before `end` is adjusted.
int total;
// True when the high bit of the second limit is clear.
bool flushLast;
};
// Unpacks two coded limits into a SegReduceRange. The low 31 bits of each
// limit carry an index; the high bit of limit1 carries the inverse of the
// flushLast flag. When the last segment is not flushed, `end` is extended by
// one so that it is included.
MGPU_DEVICE SegReduceRange DeviceShiftRange(int limit0, int limit1) {
	const int first = 0x7fffffff & limit0;
	const int last = 0x7fffffff & limit1;
	// A clear high bit on limit1 means the last segment is flushed.
	const bool flush = (0x80000000 & limit1) == 0;

	SegReduceRange result;
	result.begin = first;
	result.total = last - first;
	result.flushLast = flush;
	result.end = flush ? last : last + 1;
	return result;
}
// Reconstitute row/segment indices from a starting row index and packed end
// flags. Used for pre-processed versions of interval reduce and interval Spmv.
// Rebuilds VT + 1 row indices from a starting row and a bit field of end
// flags: rows[0] = first, and rows[i + 1] is one greater than rows[i] whenever
// bit i of endFlags is set.
// NOTE(review): the template parameter list is missing in this copy; the body
// references VT.
template
MGPU_DEVICE void DeviceExpandFlagsToRows(int first, int endFlags,
int rows[VT + 1]) {
rows[0] = first;
#pragma unroll
for(int i = 0; i < VT; ++i) {
// Bit i set means element i is the last of its row, so the next element
// starts a new one.
if((1<< i) & endFlags) ++first;
rows[i + 1] = first;
}
}
////////////////////////////////////////////////////////////////////////////////
// After loading CSR terms into shared memory, each thread binary searches
// (upper-bound) to find its starting point. Each thread then walks forward,
// emitting the csr0-relative row indices to register.
// Expands CSR row offsets into per-element row indices for one thread.
//   tidOffset  - index of this thread's first element.
//   csr_shared - row start offsets for the tile.
//   numRows    - number of entries in csr_shared.
//   end        - sentinel used as the "next row start" after the last row.
//   rows       - receives VT + 1 row indices.
//   rowStarts  - receives the start offset of the row for each of VT elements.
// Returns a bit field with bit (i - 1) set when the row changes at tidOffset + i.
// Ends with a barrier, so all threads of the CTA must call.
// NOTE(review): the template parameter list and the explicit template
// arguments of BinarySearch / mgpu::less are missing in this copy; the body
// references VT.
template
MGPU_DEVICE int DeviceExpandCsrRows(int tidOffset, int* csr_shared,
int numRows, int end, int rows[VT + 1], int rowStarts[VT]) {
// Each thread binary searches for its starting row.
// The - 1 steps back from the search result to the row containing tidOffset.
int row = BinarySearch(csr_shared, numRows, tidOffset,
mgpu::less()) - 1;
// Each thread starts at row and scans forward, emitting row IDs into
// register. Store the CTA-local row index (starts at 0) to rows and the
// start of the row (globally) to rowStarts.
int curOffset = csr_shared[row];
int nextOffset = (row + 1 < numRows) ? csr_shared[row + 1] : end;
rows[0] = row;
rowStarts[0] = curOffset;
int endFlags = 0;
#pragma unroll
for(int i = 1; i <= VT; ++i) {
// Advance the row cursor when the iterator hits the next row offset.
// This is a single equality test per step, so at most one row is advanced
// per element.
if(tidOffset + i == nextOffset) {
// Set an end flag when the cursor advances to the next row.
endFlags |= 1<< (i - 1);
// Advance the cursor and load the next row offset.
++row;
curOffset = nextOffset;
nextOffset = (row + 1 < numRows) ? csr_shared[row + 1] : end;
}
rows[i] = row;
// rowStarts has only VT entries, unlike rows.
if(i < VT) rowStarts[i] = curOffset;
}
__syncthreads();
return endFlags;
}
////////////////////////////////////////////////////////////////////////////////
// DeviceSegReducePrepare
// Expand non-empty interval of CSR elements into row indices. Compute end-flags
// by comparing adjacent row IDs.
// DeviceSegReducePrepare may be called either by a pre-processing kernel or by
// the kernel that actually evaluates the segmented reduction if no preprocessing
// is desired.
// Per-thread results of DeviceSegReducePrepare.
struct SegReduceTerms {
// Bit field returned by DeviceExpandCsrRows.
int endFlags;
// Distance returned by DeviceFindSegScanDelta.
int tidDelta;
};
// Computes, for one thread, the row indices of its VT elements plus the two
// terms needed by the segmented reduction (end flags and scan distance).
// csr_shared is used first as the row offsets and then as scratch space for
// DeviceFindSegScanDelta.
// NOTE(review): the template parameter list and the explicit template
// arguments of the helper calls are missing in this copy; the body references
// NT and VT.
template
MGPU_DEVICE SegReduceTerms DeviceSegReducePrepare(int* csr_shared, int numRows,
int tid, int gid, bool flushLast, int rows[VT + 1], int rowStarts[VT]) {
// Pass a sentinel (end) to point to the next segment start. If we flush,
// this is the end of this tile. Otherwise it is INT_MAX
int endFlags = DeviceExpandCsrRows(gid + VT * tid, csr_shared,
numRows, flushLast ? (gid + NT * VT) : INT_MAX, rows, rowStarts);
// Find the distance to scan to compute carry-in for each thread. Use the
// existence of an end flag anywhere in the thread to determine if carry-out
// values from the left should propagate through to the right.
int tidDelta = DeviceFindSegScanDelta(tid, rows[0] != rows[VT],
csr_shared);
SegReduceTerms terms = { endFlags, tidDelta };
return terms;
}
////////////////////////////////////////////////////////////////////////////////
// CTASegReduce
// Core segmented reduction code. Supports fast-path and slow-path for intra-CTA
// segmented reduction. Stores partials to global memory.
// Callers feed CTASegReduce::ReduceToGlobal values in thread order.
// Segmented reduction over one tile. Each thread reduces its own VT values by
// row, a segmented scan over the per-thread carry-outs supplies each thread's
// carry-in, and the per-row results are written to global memory.
// NOTE(review): the template parameter list and the CTASegScan template
// arguments are missing in this copy; the body references NT, VT,
// HalfCapacity, T, Op and DestIt.
template
struct CTASegReduce {
typedef CTASegScan SegScan;
enum {
NV = NT * VT,
// With HalfCapacity only half a tile of partials fits in shared memory.
Capacity = HalfCapacity ? (NV / 2) : NV
};
// The scan scratch space and the staging buffer are never live at the same time.
union Storage {
typename SegScan::Storage segScanStorage;
T values[Capacity];
};
// rows            - VT + 1 tile-local row indices for this thread's elements.
// total           - number of rows staged and copied out on the shared path.
// tidDelta        - scan distance for this thread (see DeviceFindSegScanDelta).
// startRow        - offset added to dest_global before storing.
// block           - index at which the CTA's carry-out is stored.
// data            - this thread's VT input values.
// dest_global     - destination for per-row reductions.
// carryOut_global - destination for the CTA's carry-out.
// All threads must call because of the barriers in the scan and staging paths.
template
MGPU_DEVICE static void ReduceToGlobal(const int rows[VT + 1], int total,
int tidDelta, int startRow, int block, int tid, T data[VT],
DestIt dest_global, T* carryOut_global, T identity, Op op,
Storage& storage) {
// Run a segmented scan within the thread.
T x, localScan[VT];
#pragma unroll
for(int i = 0; i < VT; ++i) {
x = i ? op(x, data[i]) : data[i];
localScan[i] = x;
// Reset the running value at a row boundary; after the loop x is this
// thread's carry-out.
if(rows[i] != rows[i + 1]) x = identity;
}
// Run a parallel segmented scan over the carry-out values to compute
// carry-in.
T carryOut;
T carryIn = SegScan::SegScanDelta(tid, tidDelta, x,
storage.segScanStorage, &carryOut, identity, op);
// Store the carry-out for the entire CTA to global memory.
if(!tid) carryOut_global[block] = carryOut;
dest_global += startRow;
if(HalfCapacity && total > Capacity) {
// Add carry-in to each thread-local scan value. Store directly
// to global.
#pragma unroll
for(int i = 0; i < VT; ++i) {
// Add the carry-in to the local scan.
T x2 = op(carryIn, localScan[i]);
// Store on the end flag and clear the carry-in.
if(rows[i] != rows[i + 1]) {
carryIn = identity;
dest_global[rows[i]] = x2;
}
}
} else {
// All partials fit in shared memory. Add carry-in to each thread-
// local scan value.
#pragma unroll
for(int i = 0; i < VT; ++i) {
// Add the carry-in to the local scan.
T x2 = op(carryIn, localScan[i]);
// Store reduction when the segment changes and clear the
// carry-in.
if(rows[i] != rows[i + 1]) {
storage.values[rows[i]] = x2;
carryIn = identity;
}
}
__syncthreads();
// Cooperatively store reductions to global memory.
for(int index = tid; index < total; index += NT)
dest_global[index] = storage.values[index];
// Barrier so the staging buffer can be reused after return.
__syncthreads();
}
}
};
} // namespace mgpu
================================================
FILE: 3rdparty/ctc_include/contrib/moderngpu/include/device/ctasegscan.cuh
================================================
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "ctascan.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// DeviceFindSegScanDelta
// Runs an inclusive max-index scan over binary inputs.
// For each thread, returns the distance (tid - start) to the nearest thread at
// or to its left whose flag is set; when no such thread exists the distance is
// measured from thread 0.
//   delta_shared - scratch space; slots [0, NumWarps) hold per-warp flag bits
//                  and slots [NumWarps, 2 * NumWarps) hold per-warp fallback
//                  starts, so at least 2 * NumWarps ints are required.
// All threads must call because the function contains barriers. The warp size
// is hard-coded as 32 here.
// NOTE(review): the template parameter list is missing in this copy; the body
// references NT.
template
MGPU_DEVICE int DeviceFindSegScanDelta(int tid, bool flag, int* delta_shared) {
const int NumWarps = NT / 32;
int warp = tid / 32;
int lane = 31 & tid;
uint warpMask = 0xffffffff>> (31 - lane); // inclusive search
uint ctaMask = 0x7fffffff>> (31 - lane); // exclusive search
// One bit per lane of this warp.
uint warpBits = __ballot(flag);
delta_shared[warp] = warpBits;
__syncthreads();
// The first NumWarps threads each compute the fallback start for one warp:
// the position of the last flag in any preceding warp, or 0 if there is none.
if(tid < NumWarps) {
// One bit per warp that contains at least one flag.
uint ctaBits = __ballot(0 != delta_shared[tid]);
// Highest flagged warp strictly below this one, or -1.
int warpSegment = 31 - clz(ctaMask & ctaBits);
int start = (-1 != warpSegment) ?
(31 - clz(delta_shared[warpSegment]) + 32 * warpSegment) : 0;
delta_shared[NumWarps + tid] = start;
}
__syncthreads();
// Find the closest flag to the left of this thread within the warp.
// Include the flag for this thread.
int start = 31 - clz(warpMask & warpBits);
// Found in this warp: convert the lane to a CTA-wide thread index.
if(-1 != start) start += ~31 & tid;
// Otherwise fall back to the last flag in an earlier warp.
else start = delta_shared[NumWarps + warp];
__syncthreads();
return tid - start;
}
////////////////////////////////////////////////////////////////////////////////
// CTASegScan
// CTA-wide segmented scan. SegScanDelta performs the scan given each thread's
// distance to its segment start; SegScan computes that distance from a flag
// first.
// NOTE(review): the template parameter list is garbled in this copy
// (`template >`), and the DeviceFindSegScanDelta call has lost its explicit
// template argument; the body references NT and _Op.
template >
struct CTASegScan {
typedef _Op Op;
typedef typename Op::result_type T;
enum { NumWarps = NT / 32, Size = NT, Capacity = 2 * NT };
union Storage {
// NOTE(review): DeviceFindSegScanDelta indexes up to 2 * NumWarps - 1, which is
// beyond this array's declared bound; the accesses land inside the union
// because `values` is larger -- confirm this is intended.
int delta[NumWarps];
T values[Capacity];
};
// Each thread passes the reduction of the LAST SEGMENT that it covers.
// flag is set to true if there's at least one segment flag in the thread.
// SegScan returns the reduction of values for the first segment in this
// thread over the preceding threads.
// Return the value init for the first thread.
// When scanning single elements per thread, interpret the flag as a BEGIN
// FLAG. If tid's flag is set, its value belongs to thread tid + 1, not
// thread tid.
// The function returns the reduction of the last segment in the CTA.
MGPU_DEVICE static T SegScanDelta(int tid, int tidDelta, T x,
Storage& storage, T* carryOut, T identity = (T)0, Op op = Op()) {
// Run an inclusive scan
// The two NT-sized halves of storage.values are used alternately; `first`
// is the offset of the half holding the current data.
int first = 0;
storage.values[first + tid] = x;
__syncthreads();
#pragma unroll
for(int offset = 1; offset < NT; offset += offset) {
// Only combine with a neighbor that is within this thread's segment distance.
if(tidDelta >= offset)
x = op(storage.values[first + tid - offset], x);
first = NT - first;
storage.values[first + tid] = x;
__syncthreads();
}
// Get the exclusive scan.
x = tid ? storage.values[first + tid - 1] : identity;
*carryOut = storage.values[first + NT - 1];
__syncthreads();
return x;
}
// Convenience entry point: derives tidDelta from flag, then scans.
MGPU_DEVICE static T SegScan(int tid, T x, bool flag, Storage& storage,
T* carryOut, T identity = (T)0, Op op = Op()) {
// Find the left-most thread that covers the first segment of this
// thread.
int tidDelta = DeviceFindSegScanDelta(tid, flag, storage.delta);
return SegScanDelta(tid, tidDelta, x, storage, carryOut, identity, op);
}
};
} // namespace mgpu
================================================
FILE: 3rdparty/ctc_include/contrib/moderngpu/include/device/ctasegsort.cuh
================================================
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "ctascan.cuh"
#include "ctasearch.cuh"
#include "loadstore.cuh"
#include "sortnetwork.cuh"
namespace mgpu {
// Serially merges VT elements from two spans of keys_shared into registers,
// honouring segment boundaries.
//   keys_shared      - source keys; A occupies [aBegin, aEnd), B occupies
//                      [bBegin, bEnd).
//   results, indices - receive each emitted key and the keys_shared position
//                      it was read from.
//   leftEnd          - A elements positioned before leftEnd are always
//                      emitted ahead of B.
//   rightStart       - B is never consumed at or beyond this position.
//   comp             - ordering predicate; on ties A is emitted first.
//   sync             - when true, finish with a __syncthreads() barrier.
// VT, T and Comp are compile-time parameters not declared in the visible text.
template
MGPU_DEVICE void SegmentedSerialMerge(const T* keys_shared, int aBegin,
	int aEnd, int bBegin, int bEnd, T results[VT], int indices[VT],
	int leftEnd, int rightStart, Comp comp, bool sync = true) {
	// Clamp B so that it stops where the right-hand segment begins.
	bEnd = min(rightStart, bEnd);

	T headA = keys_shared[aBegin];
	T headB = keys_shared[bBegin];

	#pragma unroll
	for(int slot = 0; slot < VT; ++slot) {
		// Take from A only while A still has input, and then when B is
		// exhausted, when A has not yet reached leftEnd, or when B's head does
		// not order before A's head. comp is evaluated only in the last case.
		const bool takeA = (aBegin < aEnd) &&
			(bBegin >= bEnd || aBegin < leftEnd || !comp(headB, headA));

		if(takeA) {
			results[slot] = headA;
			indices[slot] = aBegin;
			headA = keys_shared[++aBegin];
		} else {
			results[slot] = headB;
			indices[slot] = bBegin;
			headB = keys_shared[++bBegin];
		}
	}

	if(sync) __syncthreads();
}
////////////////////////////////////////////////////////////////////////////////
// CTASegsortPass
// Performs one merge level of the CTA segmented sort for the calling thread.
//   keys_shared   - keys read by the merge helpers.
//   ranges_shared - one packed range per list: start in the low 16 bits, end
//                   in the bits above them.
//   tid           - thread index within the CTA.
//   pass          - merge level; a thread's list number is tid >> pass.
//   results       - receives the VT keys this thread merges.
//   indices       - receives the keys_shared position of each merged key.
//   activeRange   - in: range of this thread's list (x = start, y = end);
//                   out: that range widened to include the sibling's range.
//   comp          - ordering predicate forwarded to the merge helpers.
// NT, VT, T and Comp are compile-time parameters not declared in the visible
// text.
template
MGPU_DEVICE void CTASegsortPass(T* keys_shared, int* ranges_shared, int tid,
int pass, T results[VT], int indices[VT], int2& activeRange, Comp comp) {
// Locate the intervals of the input lists.
// FindMergesortFrame is defined elsewhere; its x, y and z fields are used here
// as the start of list A, the start of list B and the per-list length.
int3 frame = FindMergesortFrame(2<< pass, tid, VT);
int a0 = frame.x;
int b0 = frame.y;
int listLen = frame.z;
int list = tid>> pass;
int listParity = 1 & list;
// Offset of this thread's first output within the merged pair of lists.
int diag = VT * tid - frame.x;
// Fetch the active range for the list this thread's list is merging with.
int siblingRange = ranges_shared[1 ^ list];
int siblingStart = 0x0000ffff & siblingRange;
int siblingEnd = siblingRange>> 16;
// Create a new active range for the merge.
// An odd-numbered list takes leftEnd from its sibling and rightStart from
// itself; an even-numbered list does the opposite.
int leftEnd = listParity ? siblingEnd : activeRange.y;
int rightStart = listParity ? activeRange.x : siblingStart;
activeRange.x = min(activeRange.x, siblingStart);
activeRange.y = max(activeRange.y, siblingEnd);
int p = SegmentedMergePath(keys_shared, a0, listLen, b0, listLen, leftEnd,
rightStart, diag, comp);
int a0tid = a0 + p;
int b0tid = b0 + diag - p;
// Called with the default sync = true, so the merge ends with a
// __syncthreads() barrier before the range store below.
SegmentedSerialMerge(keys_shared, a0tid, b0, b0tid, b0 + listLen,
results, indices, leftEnd, rightStart, comp);
// Store the ranges to shared memory.
// Only the thread at the start of the merged pair (diag == 0) writes.
if(0 == diag)
ranges_shared[list>> 1] =
(int)bfi(activeRange.y, activeRange.x, 16, 16);
}
////////////////////////////////////////////////////////////////////////////////
// CTASegsortLoop
// Runs all merge passes of the CTA segmented sort.
// Each pass merges keys into threadKeys through CTASegsortPass, optionally
// permutes threadValues through values_shared using the merge indices, and
// then writes the merged keys back to keys_shared for the next pass.
//   threadKeys, threadValues - per-thread arrays updated in place.
//   keys_shared, values_shared, ranges_shared - scratch buffers handed to the
//       helpers.
//   tid         - thread index within the CTA.
//   activeRange - initial range of this thread's list; updated by each pass.
//   comp        - ordering predicate.
// Returns activeRange as left by the final pass.
// NT, VT, HasValues and the key/value/comparator types are compile-time
// parameters not declared in the visible text.
template
MGPU_DEVICE int2 CTASegsortLoop(KeyType threadKeys[VT],
ValType threadValues[VT], KeyType* keys_shared, ValType* values_shared,
int* ranges_shared, int tid, int2 activeRange, Comp comp) {
// Number of passes; sLogPow2 is defined elsewhere and its argument is not
// visible here (presumably log2 of the thread count - confirm).
const int NumPasses = sLogPow2::value;
#pragma unroll
for(int pass = 0; pass < NumPasses; ++pass) {
int indices[VT];
CTASegsortPass(keys_shared, ranges_shared, tid, pass,
threadKeys, indices, activeRange, comp);
if(HasValues) {
// Exchange values through shared memory.
DeviceThreadToShared(threadValues, tid, values_shared);
DeviceGather(NT * VT, values_shared, indices, tid,
threadValues);
}
// Store results in shared memory in sorted order.
DeviceThreadToShared(threadKeys, tid, keys_shared);
}
return activeRange;
}
////////////////////////////////////////////////////////////////////////////////
// CTASegsort
// Pass keys and values in register. On return, values are returned in register
// and keys returned in shared memory.
// Entry point of the CTA segmented sort: sorts each thread's VT keys in
// register, publishes the thread's head-flag range, then merges across the
// CTA with CTASegsortLoop.
//   threadKeys, threadValues - per-thread inputs, updated in place.
//   tid           - thread index within the CTA.
//   headFlags     - bitfield of segment head flags for this thread's elements.
//   keys_shared, values_shared, ranges_shared - scratch buffers.
//   comp          - ordering predicate.
// Returns the active range produced by CTASegsortLoop.
// NT, VT, Stable, HasValues and the key/value/comparator types are
// compile-time parameters not declared in the visible text.
template
MGPU_DEVICE int2 CTASegsort(KeyType threadKeys[VT], ValType threadValues[VT],
int tid, int headFlags, KeyType* keys_shared, ValType* values_shared,
int* ranges_shared, Comp comp) {
if(Stable)
// Odd-even transpose sort.
OddEvenTransposeSortFlags(threadKeys, threadValues, headFlags,
comp);
else
// Batcher's odd-even mergesort.
OddEvenMergesortFlags(threadKeys, threadValues, headFlags, comp);
// Record the first and last occurrence of head flags in this segment.
// blockEnd stays -1 when headFlags is zero (assuming clz(0) yields 32);
// otherwise it is made tile-relative by adding VT * tid.
int blockEnd = 31 - clz(headFlags);
if(-1 != blockEnd) blockEnd += VT * tid;
// ffs is treated as 1-based with 0 meaning "no bit set": with no flag the
// start becomes NT * VT.
int blockStart = ffs(headFlags);
blockStart = blockStart ? (VT * tid - 1 + blockStart) : (NT * VT);
// Pack as read back by CTASegsortPass: start in the low 16 bits, end above.
ranges_shared[tid] = (int)bfi(blockEnd, blockStart, 16, 16);
// Store back to shared mem. The values are in VT-length sorted lists.
// These are merged recursively.
DeviceThreadToShared(threadKeys, tid, keys_shared);
int2 activeRange = CTASegsortLoop(threadKeys,
threadValues, keys_shared, values_shared, ranges_shared, tid,
make_int2(blockStart, blockEnd), comp);
return activeRange;
}
// Keys-only convenience wrapper around CTASegsort.
// Supplies a local placeholder values array and passes keys_shared, cast to
// int*, as the values buffer.
// NOTE(review): the explicit template arguments of the CTASegsort call are not
// visible here; presumably HasValues is false so the placeholders are never
// read - confirm.
// Returns the active range reported by CTASegsort.
template
MGPU_DEVICE int2 CTASegsortKeys(KeyType threadKeys[VT], int tid, int headFlags,
KeyType* keys_shared, int* ranges_shared, Comp comp) {
// Never initialized: exists only to satisfy CTASegsort's signature.
int valuesTemp[VT];
return CTASegsort(threadKeys, valuesTemp, tid,
headFlags, keys_shared, (int*)keys_shared, ranges_shared, comp);
}
// Key/value convenience wrapper around CTASegsort: forwards every argument
// unchanged and returns the active range it reports.
// NOTE(review): the explicit template arguments of the CTASegsort call are not
// visible here; presumably HasValues is true - confirm.
template
MGPU_DEVICE int2 CTASegsortPairs(KeyType threadKeys[VT],
ValType threadValues[VT], int tid, int headFlags, KeyType* keys_shared,
ValType* values_shared, int* ranges_shared, Comp comp) {
return CTASegsort(threadKeys, threadValues, tid,
headFlags, keys_shared, values_shared, ranges_shared, comp);
}
////////////////////////////////////////////////////////////////////////////////
// DeviceSegBlocksort
// Load keys and values from global memory, sort in shared memory, and store
// back to global memory. Store the left-most and right-most encountered
// headflag locations to ranges_global to prepare for the next pass.
// This function is factored out of the blocksort kernel to allow easier
// customization of that kernel - we have two implementations currently:
// sort over indices and sort over bitfield.
// Sorts one tile of NT * VT elements with the CTA segmented sort.
//   keys_global, values_global       - inputs; the tile starts at element
//                                      NT * VT * block.
//   count2                           - element count passed to the load and
//                                      store helpers for this tile.
//   keys_shared, values_shared, ranges_shared - scratch buffers.
//   headFlags                        - this thread's segment head-flag bits.
//   tid, block                       - thread index and tile index.
//   keysDest_global, valsDest_global - outputs, written at the tile offset.
//   ranges_global                    - receives one packed range per tile.
//   comp                             - ordering predicate.
// NT, VT, HasValues and the iterator/key/value types are compile-time
// parameters not declared in the visible text.
template
MGPU_DEVICE void DeviceSegBlocksort(InputIt1 keys_global,
InputIt2 values_global, int count2, KeyType* keys_shared,
ValType* values_shared, int* ranges_shared, int headFlags, int tid,
int block, OutputIt1 keysDest_global, OutputIt2 valsDest_global,
int* ranges_global, Comp comp) {
// Load keys into register in thread order.
int gid = NT * VT * block;
KeyType threadKeys[VT];
DeviceGlobalToShared(count2, keys_global + gid, tid, keys_shared);
DeviceSharedToThread(keys_shared, tid, threadKeys);
// Load the values from global memory and into register in thread order.
// threadValues stays uninitialized when HasValues is false.
ValType threadValues[VT];
if(HasValues) {
DeviceGlobalToShared(count2, values_global + gid, tid,
values_shared);
DeviceSharedToThread(values_shared, tid, threadValues);
}
// Run the CTA segmented blocksort.
int2 activeRange = CTASegsort(threadKeys,
threadValues, tid, headFlags, keys_shared, values_shared, ranges_shared,
comp);
// Store the keys to global memory.
DeviceSharedToGlobal(count2, keys_shared, tid,
keysDest_global + gid);
if(HasValues) {
// Store the values to global memory.
DeviceThreadToShared(threadValues, tid, values_shared);
DeviceSharedToGlobal(count2, values_shared, tid,
valsDest_global + gid, false);
}
// Store the 16-bit packed ranges. These are used by all merge kernels and
// the first level of global segmented merge path partitioning.
// Only thread 0 of the CTA writes the tile's range.
if(!tid)
ranges_global[block] = bfi(activeRange.y, activeRange.x, 16, 16);
}
////////////////////////////////////////////////////////////////////////////////
// DeviceIndicesToHeadFlags
// Load indices from an array and cooperatively turn into a head flag bitfield
// for each thread.
// Builds this thread's head-flag bitfield from a list of segment start
// indices.
//   indices_global    - segment start positions.
//   partitions_global - entries block and block + 1 bound the slice of
//                       indices_global handled by this tile.
//   tid, block        - thread index and tile index.
//   count2            - number of valid elements in this tile.
//   words_shared      - scratch, cleared and later read one int at a time.
//   flags_shared      - scratch, written one byte per head flag.
// NOTE(review): flags are set through flags_shared but read through
// words_shared, so the two presumably alias the same buffer - confirm at the
// call site.
// Returns a bitfield with bit i set when the thread's i'th element starts a
// segment or lies beyond count2; returns 0 when the tile has no indices and
// is full.
// NT and VT are compile-time values not declared in the visible text.
template
MGPU_DEVICE int DeviceIndicesToHeadFlags(const int* indices_global,
const int* partitions_global, int tid, int block, int count2,
int* words_shared, byte* flags_shared) {
// One int holds four byte-sized flags.
const int FlagWordsPerThread = MGPU_DIV_UP(VT, 4);
int gid = NT * VT * block;
int p0 = partitions_global[block];
int p1 = partitions_global[block + 1];
int headFlags = 0;
// Work is needed only if the tile owns indices or is partially filled.
if(p1 > p0 || count2 < NT * VT) {
// Clear the flag bytes, then loop through the indices and poke in flag
// values.
#pragma unroll
for(int i = 0; i < FlagWordsPerThread; ++i)
words_shared[NT * i + tid] = 0;
__syncthreads();
for(int index = p0 + tid; index < p1; index += NT) {
int headFlag = indices_global[index];
// Convert the global position to a tile-relative byte offset.
flags_shared[headFlag - gid] = 1;
}
__syncthreads();
// Combine all the head flags for this thread.
int first = VT * tid;
int offset = first / 4;
int prev = words_shared[offset];
// Selector for prmt (defined elsewhere); presumably picks four consecutive
// bytes starting at byte (3 & first) of the prev:next pair - confirm.
int mask = 0x3210 + 0x1111 * (3 & first);
#pragma unroll
for(int i = 0; i < FlagWordsPerThread; ++i) {
// Gather the next four flags.
int next = words_shared[offset + 1 + i];
int x = prmt(prev, next, mask);
prev = next;
// Set the head flag bits.
if(0x00000001 & x) headFlags |= 1<< (4 * i);
if(0x00000100 & x) headFlags |= 1<< (4 * i + 1);
if(0x00010000 & x) headFlags |= 1<< (4 * i + 2);
if(0x01000000 & x) headFlags |= 1<< (4 * i + 3);
}
__syncthreads();
// Set head flags for out-of-range keys.
// outOfRange counts how many of this thread's VT slots fall at or past count2.
int outOfRange = min(VT, first + VT - count2);
if(outOfRange > 0)
headFlags = bfi(0xffffffff, headFlags, VT - outOfRange, outOfRange);
// Clear head flags above VT.
headFlags &= (1<< VT) - 1;
}
return headFlags;
}
////////////////////////////////////////////////////////////////////////////////
// SegSortSupport
// Bundle of raw pointers handed around by the segmented sort.
// The "_global" suffix follows the naming used for global-memory arguments
// elsewhere in this file. The roles noted below are inferred from the field
// names only - confirm against the kernels that fill and read them.
struct SegSortSupport {
// Packed ranges; presumably the per-tile values DeviceSegBlocksort writes.
int* ranges_global;
// NOTE(review): purpose not visible in this file section.
int2* ranges2_global;
// Presumably the queue of merge work items.
int4* mergeList_global;
// Presumably the queue of copy work items.
int* copyList_global;
// Presumably the counters for the current and the next pass.
int2* queueCounters_global;
int2* nextCounters_global;
// Presumably one status byte per tile.
byte* copyStatus_global;
};
////////////////////////////////////////////////////////////////////////////////
// DeviceSegSortMerge
// Merges one output tile of the global segmented sort.
//   keys_global, values_global - source arrays.
//   segmentRange  - x = segStart, y = segEnd; decides which warps must merge.
//   tid, block    - thread index and tile index.
//   range         - A spans [range.x, range.y) and B spans [range.z, range.w)
//                   in keys_global.
//   pass          - merge level; list parity is bit 0 of block >> pass.
//   keys_shared   - staging buffer for A followed by B.
//   indices_shared- staging buffer for the gather indices of the values.
//   keysDest_global, valsDest_global - outputs, written at NT * VT * block.
//   comp          - ordering predicate.
// NT, VT, HasValues and the key/value/comparator types are compile-time
// parameters not declared in the visible text.
template
MGPU_DEVICE void DeviceSegSortMerge(const KeyType* keys_global,
const ValueType* values_global, int2 segmentRange, int tid,
int block, int4 range, int pass, KeyType* keys_shared,
int* indices_shared, KeyType* keysDest_global, ValueType* valsDest_global,
Comp comp) {
const int NV = NT * VT;
int gid = NV * block;
// Load the local compressed segment indices.
int a0 = range.x;
int aCount = range.y - range.x;
int b0 = range.z;
int bCount = range.w - range.z;
DeviceLoad2ToShared(keys_global + a0, aCount, keys_global + b0,
bCount, tid, keys_shared);
////////////////////////////////////////////////////////////////////////////
// Run a merge path to find the starting point for each thread to merge.
// If the entire warp fits into the already-sorted segments, we can skip
// sorting it and leave its keys in shared memory. Doing this on the warp
// level rather than thread level (also legal) gives slightly better
// performance.
int segStart = segmentRange.x;
int segEnd = segmentRange.y;
int listParity = 1 & (block>> pass);
// Tile-relative position of the first element owned by this thread's warp.
int warpOffset = VT * (~31 & tid);
bool sortWarp = listParity ?
// The spliced segment is to the left (segStart).
(warpOffset < segStart) :
// The spliced segment is to the right (segEnd).
(warpOffset + 32 * VT > segEnd);
// Both arrays are filled only when sortWarp is true and read only then.
KeyType threadKeys[VT];
int indices[VT];
if(sortWarp) {
int diag = VT * tid;
int mp = SegmentedMergePath(keys_shared, 0, aCount, aCount, bCount,
listParity ? 0 : segEnd, listParity ? segStart : NV, diag, comp);
int a0tid = mp;
int a1tid = aCount;
int b0tid = aCount + diag - mp;
int b1tid = aCount + bCount;
// Serial merge into register. All threads in the CTA so we hoist the
// check for list parity outside the function call to simplify the
// logic. Unlike in the blocksort, this does not cause warp divergence.
// sync = false: the barrier is issued below, outside the divergent branch.
SegmentedSerialMerge(keys_shared, a0tid, a1tid, b0tid, b1tid,
threadKeys, indices, listParity ? 0 : segEnd,
listParity ? segStart : NV, comp, false);
}
__syncthreads();
// Store sorted data in register back to shared memory. Then copy to global.
if(sortWarp)
DeviceThreadToShared(threadKeys, tid, keys_shared, false);
__syncthreads();
DeviceSharedToGlobal(aCount + bCount, keys_shared, tid,
keysDest_global + gid);
////////////////////////////////////////////////////////////////////////////
// Use the merge indices to gather values from global memory. Store directly
// to valsDest_global.
if(HasValues) {
// Transpose the gather indices to help coalesce loads.
if(sortWarp)
DeviceThreadToShared(indices, tid, indices_shared, false);
else {
// Warps that skipped the merge keep their elements in place, so they
// publish identity indices.
#pragma unroll
for(int i = 0; i < VT; ++i)
indices_shared[VT * tid + i] = VT * tid + i;
}
__syncthreads();
DeviceTransferMergeValuesShared(aCount + bCount,
values_global + a0, values_global + b0, aCount, indices_shared,
tid, valsDest_global + NV * block);
}
}
////////////////////////////////////////////////////////////////////////////////
// DeviceSegSortCopy
// Copies one tile of keys (and, when HasValues is set, values) from the source
// arrays to the destination arrays without reordering.
//   keys_global, values_global       - source arrays.
//   tid, block                       - thread index and tile index.
//   count                            - total number of elements.
//   keysDest_global, valsDest_global - destination arrays.
// NT, VT, HasValues and the key/value types are compile-time parameters not
// declared in the visible text.
template
MGPU_DEVICE void DeviceSegSortCopy(const KeyType* keys_global,
	const ValueType* values_global, int tid, int block, int count,
	KeyType* keysDest_global, ValueType* valsDest_global) {
	// First element of this tile and the number of elements it really holds
	// (the last tile may be partial).
	const int tileBase = NT * VT * block;
	const int tileCount = min(NT * VT, count - tileBase);

	DeviceGlobalToGlobal(tileCount, keys_global + tileBase, tid,
		keysDest_global + tileBase);

	if(!HasValues) return;

	DeviceGlobalToGlobal(tileCount, values_global + tileBase, tid,
		valsDest_global + tileBase);
}
} // namespace mgpu
================================================
FILE: 3rdparty/ctc_include/contrib/moderngpu/include/device/ctasortedsearch.cuh
================================================
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "../mgpudevice.cuh"
#include "ctasearch.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// DeviceSerialSearch
// Serial sorted-search step for one thread: walks VT iterations, each of which
// advances either the A cursor (needles) or the B cursor (haystack).
//   keys_shared      - keys of A and B; cursors index into it directly.
//   aBegin, aEnd     - A cursor and its end.
//   bBegin, bEnd     - B cursor and its end.
//   aOffset, bOffset - added to the cursor when an index is recorded.
//   indices          - per-iteration output: the opposite cursor (plus offset)
//                      when indexing is enabled, OR'ed with the match flag.
//   comp             - ordering predicate.
// Returns int3(decisions, matchCountA, matchCountB); bit i of decisions is set
// when iteration i advanced A.
// VT, Bounds, RangeCheck, IndexA, MatchA, IndexB and MatchB are compile-time
// parameters not declared in the visible text.
template
MGPU_DEVICE int3 DeviceSerialSearch(const T* keys_shared, int aBegin,
int aEnd, int bBegin, int bEnd, int aOffset, int bOffset, int* indices,
Comp comp) {
// With indexing on, the match flag goes in the top bit so it cannot collide
// with the index; otherwise it is simply 1.
const int FlagA = IndexA ? 0x80000000 : 1;
const int FlagB = IndexB ? 0x80000000 : 1;
T aKey = keys_shared[aBegin];
T bKey = keys_shared[bBegin];
// aPrev / bPrev are loaded only when the cursor is above 0 and are otherwise
// left uninitialized until the first advance on that side.
T aPrev, bPrev;
if(aBegin > 0) aPrev = keys_shared[aBegin - 1];
if(bBegin > 0) bPrev = keys_shared[bBegin - 1];
int decisions = 0;
int matchCountA = 0;
int matchCountB = 0;
#pragma unroll
for(int i = 0; i < VT; ++i) {
// p == true advances A, p == false advances B.
bool p;
if(RangeCheck && aBegin >= aEnd) p = false;
else if(RangeCheck && bBegin >= bEnd) p = true;
else p = (MgpuBoundsUpper == Bounds) ?
comp(aKey, bKey) :
!comp(bKey, aKey);
if(p) {
// aKey is smaller than bKey, so it is inserted before bKey.
// Save bKey's index (bBegin + first) as the result of the search
// and advance to the next needle in A.
bool match = false;
if(MatchA) {
// Test if there is an element in B that matches aKey.
if(MgpuBoundsUpper == Bounds) {
// Upper Bound: We're inserting aKey after bKey. If there
// is a match for aKey it must be bPrev. Check that bPrev
// is in range and equal to aKey.
// The predicate test result !comp(aKey, bPrev) was
// established on the previous A-advancing iteration (it
// failed the comp(aKey, bKey) test to get us to this
// point). Check the other half of the equality condition
// with a second comparison.
// NOTE(review): this compares the B cursor with aEnd rather than
// with a B start position; presumably B is stored after A in
// keys_shared so bBegin > aEnd means a previous B key exists -
// confirm against the caller's layout.
bool inRange = !RangeCheck || (bBegin > aEnd);
match = inRange && !comp(bPrev, aKey);
} else {
// Lower Bound: We're inserting aKey before bKey. If there
// is a match for aKey, it must be bKey. Check that bKey
// is in range and equal to aKey.
// The predicate test !comp(bKey, aKey) has established one
// half of the equality condition. We establish the other
// half with a second comparison.
bool inRange = !RangeCheck || (bBegin < bEnd);
match = inRange && !comp(aKey, bKey);
}
}
int index = 0;
if(IndexA) index = bOffset + bBegin;
if(match) index |= FlagA;
if(IndexA || MatchA) indices[i] = index;
matchCountA += match;
// Mark the decision bit to indicate that this iteration has
// progressed A (the needles).
decisions |= 1<< i;
aPrev = aKey;
aKey = keys_shared[++aBegin];
} else {
// aKey is larger than bKey, so it is inserted after bKey (but we
// don't know where yet). Advance the B index to the next element in
// the haystack to continue the search for the current needle.
bool match = false;
if(MatchB) {
if(MgpuBoundsUpper == Bounds) {
// Upper Bound: aKey is not smaller than bKey. We advance to
// the next haystack element in B. If there is a match in A
// for bKey it must be aKey. By entering this branch we've
// verified that !comp(aKey, bKey). Making the reciprocal
// comparison !comp(bKey, aKey) establishes aKey == bKey.
bool inRange = !RangeCheck ||
((bBegin < bEnd) && (aBegin < aEnd));
match = inRange && !comp(bKey, aKey);
} else {
// Lower Bound: bKey is smaller than aKey. We advance to the
// next element in B. If there is a match for bKey, it must
// be aPrev. The previous A-advancing iteration proved that
// !comp(bKey, aPrev). We test !comp(aPrev, bKey) for the
// other half of the equality condition.
bool inRange = !RangeCheck ||
((bBegin < bEnd) && (aBegin > 0));
match = inRange && !comp(aPrev, bKey);
}
}
int index = 0;
if(IndexB) index = aOffset + aBegin;
if(match) index |= FlagB;
if(IndexB || MatchB) indices[i] = index;
matchCountB += match;
// Keep the decision bit cleared to indicate that this iteration
// has progressed B (the haystack).
bPrev = bKey;
bKey = keys_shared[++bBegin];
}
}
return make_int3(decisions, matchCountA, matchCountB);
}
////////////////////////////////////////////////////////////////////////////////
// CTASortedSearch
// Take keys in shared memory and return indices and b-match flags in shared
// memory.
// NOTE: This function doesn't do any strided-to-thread order transposes so
// using an even number of values per thread will incur no additional bank
// conflicts.
// CTA-level sorted search: partitions the work with MergePath, runs
// DeviceSerialSearch per thread, then compacts the per-iteration results into
// indices_shared (A results from position a0tid, B results from position
// aCount + b0tid).
//   keys_shared              - A keys start at aStart, B keys at bStart.
//   aCount, bCount           - lengths given to MergePath.
//   aEnd, bEnd               - cursor limits given to DeviceSerialSearch.
//   a0, b0                   - used to form the index offsets a0 - aStart and
//                              b0 - bStart.
//   extended                 - selects between the two DeviceSerialSearch
//                              calls below.
//   tid                      - thread index within the CTA.
//   indices_shared           - receives the compacted indices / match flags.
//   comp                     - ordering predicate.
// Returns int2(match count for A, match count for B) for this thread.
// VT, Bounds, IndexA, MatchA, IndexB and MatchB are compile-time parameters
// not declared in the visible text.
template
MGPU_DEVICE int2 CTASortedSearch(T* keys_shared, int aStart, int aCount,
int aEnd, int a0, int bStart, int bCount, int bEnd, int b0, bool extended,
int tid, int* indices_shared, Comp comp) {
// Run a merge path to find the start of the serial search for each thread.
int diag = VT * tid;
int mp = MergePath(keys_shared + aStart, aCount,
keys_shared + bStart, bCount, diag, comp);
int a0tid = mp;
int b0tid = diag - mp;
// Serial search into register.
int3 results;
int indices[VT];
// NOTE(review): the two calls read identically in the visible text; they
// presumably differ only in explicit template arguments (e.g. RangeCheck)
// that are not shown - confirm.
if(extended)
results = DeviceSerialSearch(keys_shared, a0tid + aStart, aEnd, b0tid + bStart, bEnd,
a0 - aStart, b0 - bStart, indices, comp);
else
results = DeviceSerialSearch(keys_shared, a0tid + aStart, aEnd, b0tid + bStart, bEnd,
a0 - aStart, b0 - bStart, indices, comp);
// All reads of keys_shared finish before indices_shared is written.
__syncthreads();
// Compact the indices into shared memory. Use the decision bits (set is A,
// cleared is B) to select the destination.
int decisions = results.x;
// B results are stored after the aCount slots reserved for A results.
b0tid += aCount;
#pragma unroll
for(int i = 0; i < VT; ++i) {
if((1<< i) & decisions) {
if(IndexA || MatchA) indices_shared[a0tid++] = indices[i];
} else {
if(IndexB || MatchB) indices_shared[b0tid++] = indices[i];
}
}
__syncthreads();
// Return the match counts for A and B keys.
return make_int2(results.y, results.z);
}
} // namespace mgpu
================================================
FILE: 3rdparty/ctc_include/contrib/moderngpu/include/device/devicetypes.cuh
================================================
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#if __CUDA_ARCH__ == 100
#error "COMPUTE CAPABILITY 1.0 NOT SUPPORTED BY MPGU. TRY 2.0!"
#endif
#include
#include "../util/static.h"
#ifdef _MSC_VER
#define INLINESYMBOL __forceinline__
#else
#define INLINESYMBOL inline
#endif
namespace mgpu {
#define MGPU_HOST __host__ INLINESYMBOL
#define MGPU_DEVICE __device__ INLINESYMBOL
#define MGPU_HOST_DEVICE __host__ __device__ INLINESYMBOL
const int WARP_SIZE = 32;
const int LOG_WARP_SIZE = 5;
////////////////////////////////////////////////////////////////////////////////
// Device-side comparison operators
// Comparison functors usable from both host and device code. Each call
// operator is const-qualified so the functor can also be invoked through a
// const object or a const reference.

// Returns a < b.
template
struct less : public std::binary_function {
	MGPU_HOST_DEVICE bool operator()(T a, T b) const { return a < b; }
};
// Returns a <= b.
template
struct less_equal : public std::binary_function {
	MGPU_HOST_DEVICE bool operator()(T a, T b) const { return a <= b; }
};
// Returns a > b.
template
struct greater : public std::binary_function {
	MGPU_HOST_DEVICE bool operator()(T a, T b) const { return a > b; }
};
// Returns a >= b.
template
struct greater_equal : public std::binary_function {
	MGPU_HOST_DEVICE bool operator()(T a, T b) const { return a >= b; }
};
// Returns a == b.
template
struct equal_to : public std::binary_function {
	MGPU_HOST_DEVICE bool operator()(T a, T b) const { return a == b; }
};
// Returns a != b.
template
struct not_equal_to : public std::binary_function {
	MGPU_HOST_DEVICE bool operator()(T a, T b) const { return a != b; }
};
////////////////////////////////////////////////////////////////////////////////
// Device-side arithmetic operators
// Arithmetic and bitwise functors usable from both host and device code. Each
// call operator is const-qualified so the functor can also be invoked through
// a const object or a const reference.

// Returns a + b.
template
struct plus : public std::binary_function {
	MGPU_HOST_DEVICE T operator()(T a, T b) const { return a + b; }
};
// Returns a - b.
template
struct minus : public std::binary_function {
	MGPU_HOST_DEVICE T operator()(T a, T b) const { return a - b; }
};
// Returns a * b.
template
struct multiplies : public std::binary_function {
	MGPU_HOST_DEVICE T operator()(T a, T b) const { return a * b; }
};
// Returns a % b.
template
struct modulus : public std::binary_function {
	MGPU_HOST_DEVICE T operator()(T a, T b) const { return a % b; }
};
// Returns a | b.
template
struct bit_or : public std::binary_function {
	MGPU_HOST_DEVICE T operator()(T a, T b) const { return a | b; }
};
// Returns a & b.
template
struct bit_and : public std::binary_function {
	MGPU_HOST_DEVICE T operator()(T a, T b) const { return a & b; }
};
// Returns a ^ b.
template
struct bit_xor : public std::binary_function {
	MGPU_HOST_DEVICE T operator()(T a, T b) const { return a ^ b; }
};
// Returns max(a, b).
template
struct maximum : public std::binary_function {
	MGPU_HOST_DEVICE T operator()(T a, T b) const { return max(a, b); }
};
// Returns min(a, b).
template
struct minimum : public std::binary_function {
	MGPU_HOST_DEVICE T operator()(T a, T b) const { return min(a, b); }
};
////////////////////////////////////////////////////////////////////////////////
// Exchanges the values of a and b through one temporary copy.
template
MGPU_HOST_DEVICE void swap(T& a, T& b) {
	T saved = a;
	a = b;
	b = saved;
}
// Plain aggregate holding two values of the same type.
template
struct DevicePair {
T x, y;
};
// Builds a DevicePair from x and y using aggregate initialization.
template
MGPU_HOST_DEVICE DevicePair MakeDevicePair(T x, T y) {
DevicePair p = { x, y };
return p;
}
// Host/device replacement for the subset of numeric limits used by this
// library. min/max/lowest mirror the <climits>/<cfloat> constants; AddIdent
// and MulIdent return the additive (0) and multiplicative (1) identities.
// The specialization arguments are not visible in this text; each
// specialization below is identified by the type its members return.
template struct numeric_limits;
// Members return int.
template<> struct numeric_limits {
MGPU_HOST_DEVICE static int min() { return INT_MIN; }
MGPU_HOST_DEVICE static int max() { return INT_MAX; }
MGPU_HOST_DEVICE static int lowest() { return INT_MIN; }
MGPU_HOST_DEVICE static int AddIdent() { return 0; }
MGPU_HOST_DEVICE static int MulIdent() { return 1; }
};
// Members return long long.
template<> struct numeric_limits {
MGPU_HOST_DEVICE static long long min() { return LLONG_MIN; }
MGPU_HOST_DEVICE static long long max() { return LLONG_MAX; }
MGPU_HOST_DEVICE static long long lowest() { return LLONG_MIN; }
MGPU_HOST_DEVICE static long long AddIdent() { return 0; }
MGPU_HOST_DEVICE static long long MulIdent() { return 1; }
};
// Members return uint.
template<> struct numeric_limits {
MGPU_HOST_DEVICE static uint min() { return 0; }
MGPU_HOST_DEVICE static uint max() { return UINT_MAX; }
MGPU_HOST_DEVICE static uint lowest() { return 0; }
MGPU_HOST_DEVICE static uint AddIdent() { return 0; }
MGPU_HOST_DEVICE static uint MulIdent() { return 1; }
};
// Members return unsigned long long.
template<> struct numeric_limits {
MGPU_HOST_DEVICE static unsigned long long min() { return 0; }
MGPU_HOST_DEVICE static unsigned long long max() { return ULLONG_MAX; }
MGPU_HOST_DEVICE static unsigned long long lowest() { return 0; }
MGPU_HOST_DEVICE static unsigned long long AddIdent() { return 0; }
MGPU_HOST_DEVICE static unsigned long long MulIdent() { return 1; }
};
// Members return float. min() is FLT_MIN (smallest positive normal value);
// lowest() is the most negative finite value.
template<> struct numeric_limits {
MGPU_HOST_DEVICE static float min() { return FLT_MIN; }
MGPU_HOST_DEVICE static float max() { return FLT_MAX; }
MGPU_HOST_DEVICE static float lowest() { return -FLT_MAX; }
MGPU_HOST_DEVICE static float AddIdent() { return 0; }
MGPU_HOST_DEVICE static float MulIdent() { return 1; }
};
// Members return double. min() is DBL_MIN (smallest positive normal value);
// lowest() is the most negative finite value.
template<> struct numeric_limits {
MGPU_HOST_DEVICE static double min() { return DBL_MIN; }
MGPU_HOST_DEVICE static double max() { return DBL_MAX; }
MGPU_HOST_DEVICE static double lowest() { return -DBL_MAX; }
MGPU_HOST_DEVICE static double AddIdent() { return 0; }
MGPU_HOST_DEVICE static double MulIdent() { return 1; }
};
// Component-wise sum of two int2 values.
MGPU_HOST_DEVICE int2 operator+(int2 a, int2 b) {
	const int sumX = a.x + b.x;
	const int sumY = a.y + b.y;
	return make_int2(sumX, sumY);
}
// Adds b to a component-wise, in place, and returns a.
MGPU_HOST_DEVICE int2& operator+=(int2& a, int2 b) {
	return a = a + b;
}
// Component-wise product of two int2 values.
MGPU_HOST_DEVICE int2 operator*(int2 a, int2 b) {
	const int prodX = a.x * b.x;
	const int prodY = a.y * b.y;
	return make_int2(prodX, prodY);
}
// Multiplies a by b component-wise, in place, and returns a.
MGPU_HOST_DEVICE int2& operator*=(int2& a, int2 b) {
	return a = a * b;
}
// Returns the larger of a and b; a is returned when neither orders before the
// other. Device compilation uses a plain comparison, every other compilation
// defers to std::max.
template
MGPU_HOST_DEVICE T max(T a, T b) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 100)
	if(a < b) return b;
	return a;
#else
	return std::max(a, b);
#endif
}
// Returns the smaller of a and b; a is returned when neither orders before the
// other. Device compilation uses a plain comparison, every other compilation
// defers to std::min.
template
MGPU_HOST_DEVICE T min(T a, T b) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 100)
	if(b < a) return b;
	return a;
#else
	return std::min(a, b);
#endif
}
// Component-wise maximum of two int2 values.
MGPU_HOST_DEVICE int2 max(int2 a, int2 b) {
	const int hiX = max(a.x, b.x);
	const int hiY = max(a.y, b.y);
	return make_int2(hiX, hiY);
}
// Component-wise minimum of two int2 values.
MGPU_HOST_DEVICE int2 min(int2 a, int2 b) {
	const int loX = min(a.x, b.x);
	const int loY = min(a.y, b.y);
	return make_int2(loX, loY);
}
// Specialization whose members return int2 (the specialization argument is not
// visible in this text). Every limit applies the matching int constant to both
// components; the identities are (0, 0) for addition and (1, 1) for
// multiplication.
template<> struct numeric_limits {
MGPU_HOST_DEVICE static int2 min() { return make_int2(INT_MIN, INT_MIN); }
MGPU_HOST_DEVICE static int2 max() { return make_int2(INT_MAX, INT_MAX); }
MGPU_HOST_DEVICE static int2 lowest() {
return make_int2(INT_MIN, INT_MIN);
}
MGPU_HOST_DEVICE static int2 AddIdent() { return make_int2(0, 0); }
MGPU_HOST_DEVICE static int2 MulIdent() { return make_int2(1, 1); }
};
// Iterator-like object that yields the same value at every position.
// Indexing and dereferencing ignore the position, and arithmetic leaves the
// iterator unchanged.
template
class constant_iterator : public std::iterator_traits {
public:
	MGPU_HOST_DEVICE constant_iterator(T value) : _value(value) { }

	// Element access: the index is ignored.
	MGPU_HOST_DEVICE T operator[](ptrdiff_t i) const { return _value; }
	MGPU_HOST_DEVICE T operator*() const { return _value; }

	// Offsetting yields an iterator carrying the same value.
	MGPU_HOST_DEVICE constant_iterator operator+(ptrdiff_t diff) const {
		return *this;
	}
	MGPU_HOST_DEVICE constant_iterator operator-(ptrdiff_t diff) const {
		return *this;
	}

	// In-place offsetting is a no-op.
	MGPU_HOST_DEVICE constant_iterator& operator+=(ptrdiff_t diff) {
		return *this;
	}
	MGPU_HOST_DEVICE constant_iterator& operator-=(ptrdiff_t diff) {
		return *this;
	}

private:
	T _value;
};
// Iterator-like object representing the sequence value, value + 1, ...
// Members that do not modify the iterator are const-qualified, matching
// constant_iterator, so a const counting_iterator can be read and offset.
template
class counting_iterator : public std::iterator_traits {
public:
	MGPU_HOST_DEVICE counting_iterator(T value) : _value(value) { }

	// Value i positions after the current one.
	MGPU_HOST_DEVICE T operator[](ptrdiff_t i) const {
		return _value + i;
	}
	// Current value.
	MGPU_HOST_DEVICE T operator*() const {
		return _value;
	}

	// New iterator moved forward / backward by diff.
	MGPU_HOST_DEVICE counting_iterator operator+(ptrdiff_t diff) const {
		return counting_iterator(_value + diff);
	}
	MGPU_HOST_DEVICE counting_iterator operator-(ptrdiff_t diff) const {
		return counting_iterator(_value - diff);
	}

	// Move this iterator in place.
	MGPU_HOST_DEVICE counting_iterator& operator+=(ptrdiff_t diff) {
		_value += diff;
		return *this;
	}
	MGPU_HOST_DEVICE counting_iterator& operator-=(ptrdiff_t diff) {
		_value -= diff;
		return *this;
	}

private:
	T _value;
};
// Iterator-like object over the sequence base, base + step, base + 2 * step, ...
// The iterator tracks an integer offset; position i yields
// base + (offset + i) * step.
// NOTE(review): the template parameter list and the std::iterator_traits
// argument are missing from this text; T is presumably the only parameter.
template
class step_iterator : public std::iterator_traits {
public:
// Starts at offset 0. Members are initialized in declaration order
// (_offset, then _base, _step), not in the order written in this list;
// that is harmless here because the initializers are independent.
MGPU_HOST_DEVICE step_iterator(T base, T step) :
_base(base), _step(step), _offset(0) { }
// Value i positions past the current offset; the iterator is not modified.
MGPU_HOST_DEVICE T operator[](ptrdiff_t i) {
return _base + (_offset + i) * _step;
}
// Value at the current offset.
MGPU_HOST_DEVICE T operator*() {
return _base + _offset * _step;
}
// Returns a copy whose offset is moved by +diff / -diff.
MGPU_HOST_DEVICE step_iterator operator+(ptrdiff_t diff) {
step_iterator it = *this;
it._offset += diff;
return it;
}
MGPU_HOST_DEVICE step_iterator operator-(ptrdiff_t diff) {
step_iterator it = *this;
it._offset -= diff;
return it;
}
// Moves this iterator's offset in place.
MGPU_HOST_DEVICE step_iterator& operator+=(ptrdiff_t diff) {
_offset += diff;
return *this;
}
MGPU_HOST_DEVICE step_iterator& operator-=(ptrdiff_t diff) {
_offset -= diff;
return *this;
}
private:
ptrdiff_t _offset;
T _base, _step;
};
} // namespace mgpu
// Commuted form "diff + it" for counting_iterator: forwards to it + diff.
// NOTE(review): template parameter/argument lists are missing from this text.
template
MGPU_HOST_DEVICE mgpu::counting_iterator operator+(ptrdiff_t diff,
mgpu::counting_iterator it) {
return it + diff;
}
// "diff - it" for counting_iterator. As written this returns the iterator
// moved back by diff (it + (-diff)), i.e. the same result as it - diff.
// NOTE(review): template parameter/argument lists are missing from this text.
template
MGPU_HOST_DEVICE mgpu::counting_iterator operator-(ptrdiff_t diff,
mgpu::counting_iterator it) {
return it + (-diff);
}
// Commuted form "diff + it" for step_iterator: forwards to it + diff.
// NOTE(review): template parameter/argument lists are missing from this text.
template
MGPU_HOST_DEVICE mgpu::step_iterator operator+(ptrdiff_t diff,
mgpu::step_iterator it) {
return it + diff;
}
// "diff - it" for step_iterator. As written this returns the iterator moved
// back by diff (it + (-diff)), i.e. the same result as it - diff.
// NOTE(review): template parameter/argument lists are missing from this text.
template
MGPU_HOST_DEVICE mgpu::step_iterator operator-(ptrdiff_t diff,
mgpu::step_iterator it) {
return it + (-diff);
}
================================================
FILE: 3rdparty/ctc_include/contrib/moderngpu/include/device/deviceutil.cuh
================================================
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "intrinsics.cuh"
namespace mgpu {
// Signed distance in bytes from a to b (b minus a), so the result is positive
// when b is the higher address.
MGPU_HOST_DEVICE ptrdiff_t PtrDiff(const void* a, const void* b) {
	const byte* from = (const byte*)a;
	const byte* to = (const byte*)b;
	return to - from;
}
// Offset a pointer by i bytes (not by i elements): the pointer is viewed as a
// byte pointer, advanced by i, then cast back. Const-pointer overload.
// NOTE(review): the template parameter list is missing from this text;
// T is presumably the pointee type.
template
MGPU_HOST_DEVICE const T* PtrOffset(const T* p, ptrdiff_t i) {
return (const T*)((const byte*)p + i);
}
// Mutable-pointer overload: advances p by i bytes (not i elements) and
// returns it as T*.
// NOTE(review): the template parameter list is missing from this text;
// T is presumably the pointee type.
template
MGPU_HOST_DEVICE T* PtrOffset(T* p, ptrdiff_t i) {
return (T*)((byte*)p + i);
}
////////////////////////////////////////////////////////////////////////////////
// Task range support
// Evenly distributes variable-length arrays over a fixed number of CTAs.
// Splits numItems across numWorkers. The result packs the per-worker quotient
// in .x and the leftover remainder in .y.
MGPU_HOST int2 DivideTaskRange(int numItems, int numWorkers) {
	const div_t split = div(numItems, numWorkers);
	const int perWorker = split.quot;
	const int leftover = split.rem;
	return make_int2(perWorker, leftover);
}
// Computes the [begin, end) range assigned to worker `block`, given a task
// descriptor with the base share in task.x and a remainder count in task.y.
// Workers with block < task.y receive one extra item; the start is shifted by
// the number of extra items handed to earlier workers.
MGPU_HOST_DEVICE int2 ComputeTaskRange(int block, int2 task) {
	const int begin = task.x * block + min(block, task.y);
	const int extra = (block < task.y) ? 1 : 0;
	const int end = begin + task.x + extra;
	return make_int2(begin, end);
}
// Scaled variant: takes the two-argument range for `block`, multiplies both
// ends by blockSize, and clamps the end to count.
MGPU_HOST_DEVICE int2 ComputeTaskRange(int block, int2 task, int blockSize,
	int count) {
	const int2 unscaled = ComputeTaskRange(block, task);
	const int begin = unscaled.x * blockSize;
	const int end = min(count, unscaled.y * blockSize);
	return make_int2(begin, end);
}
////////////////////////////////////////////////////////////////////////////////
// DeviceExtractHeadFlags
// Input array flags is a bit array with 32 head flags per word.
// DeviceExtractHeadFlags returns numBits flags starting at bit index.
// Extracts numBits consecutive flag bits starting at absolute bit position
// `index` from a packed bit array that stores 32 flags per word.
//   flags   - packed flag words
//   index   - absolute bit position of the first flag to return
//   numBits - number of flags to return (0..32), placed in the low bits
// When the requested span crosses a word boundary, the following word
// (flags[index / 32 + 1]) is read as well.
MGPU_HOST_DEVICE uint DeviceExtractHeadFlags(const uint* flags, int index,
	int numBits) {
	int index2 = index >> 5;      // word containing the first requested bit
	int shift = 31 & index;       // bit position inside that word
	uint headFlags = flags[index2] >> shift;
	int shifted = 32 - shift;     // number of valid bits obtained so far
	if(shifted < numBits)
		// We also need to shift in the next set of bits.
		headFlags = bfi(flags[index2 + 1], headFlags, shifted, shift);
	// Keep only the low numBits bits. A full 32-bit request needs no masking;
	// building the mask with a 32-bit shift would be undefined, and the mask
	// is computed in unsigned arithmetic to avoid signed overflow at 31 bits.
	if(numBits < 32)
		headFlags &= (1u << numBits) - 1u;
	return headFlags;
}
////////////////////////////////////////////////////////////////////////////////
// DevicePackHeadFlags
// Pack VT bits per thread at 32 bits/thread. Will consume an integer number of
// words, because CTA size is a multiple of 32. The first NT * VT / 32 threads
// return packed words.
// Packs per-thread flag bits (VT bits per thread, held in threadBits) into
// 32-bit words via flags_shared. Threads with tid < NT * VT / 32 return one
// packed word each; all other threads return 0.
// Every thread writes flags_shared[tid], and the function contains two
// __syncthreads() barriers, so all threads of the CTA must call it.
// NOTE(review): the template parameter list is missing from this text;
// NT and VT are presumably its integer parameters.
template
MGPU_DEVICE uint DevicePackHeadFlags(uint threadBits, int tid,
uint* flags_shared) {
const int WordCount = NT * VT / 32;
// Each thread stores its thread bits to flags_shared[tid].
flags_shared[tid] = threadBits;
__syncthreads();
uint packed = 0;
if(tid < WordCount) {
// Upper bound on how many further per-thread entries can contribute to a word.
const int Items = MGPU_DIV_UP(32, VT);
// First output bit covered by this thread's word, and the entry holding it.
int index = 32 * tid;
int first = index / VT;
int bit = 0;
// Bits of that first entry that belong to the previous word are shifted out.
int rem = index - VT * first;
packed = flags_shared[first]>> rem;
bit = VT - rem;
++first;
#pragma unroll
for(int i = 0; i < Items; ++i) {
// The last iteration is skipped once the word is already full.
if(i < Items - 1 || bit < 32) {
uint x = flags_shared[first + i];
if(bit < 32) packed |= x<< bit;
bit += VT;
}
}
}
// Barrier before returning, after all reads of flags_shared above.
__syncthreads();
return packed;
}
} // namespace mgpu
================================================
FILE: 3rdparty/ctc_include/contrib/moderngpu/include/device/intrinsics.cuh
================================================
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#include "devicetypes.cuh"
#pragma once
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
namespace mgpu {
// Reinterprets the storage of a 64-bit unsigned value as a uint2 (no numeric
// conversion; the bytes are read through a cast pointer).
// NOTE(review): the reinterpret_cast target type is missing from this text.
MGPU_HOST_DEVICE uint2 ulonglong_as_uint2(uint64 x) {
return *reinterpret_cast(&x);
}
// Reinterprets the storage of a uint2 as a 64-bit unsigned value (inverse of
// ulonglong_as_uint2).
// NOTE(review): the reinterpret_cast target type is missing from this text.
MGPU_HOST_DEVICE uint64 uint2_as_ulonglong(uint2 x) {
return *reinterpret_cast(&x);
}
// Reinterprets the storage of a 64-bit signed value as an int2.
// NOTE(review): the reinterpret_cast target type is missing from this text.
MGPU_HOST_DEVICE int2 longlong_as_int2(int64 x) {
return *reinterpret_cast(&x);
}
// Reinterprets the storage of an int2 as a 64-bit signed value (inverse of
// longlong_as_int2).
// NOTE(review): the reinterpret_cast target type is missing from this text.
MGPU_HOST_DEVICE int64 int2_as_longlong(int2 x) {
return *reinterpret_cast(&x);
}
// Reinterprets the storage of a double as an int2 (bit pattern preserved, no
// numeric conversion).
// NOTE(review): the reinterpret_cast target type is missing from this text.
MGPU_HOST_DEVICE int2 double_as_int2(double x) {
return *reinterpret_cast(&x);
}
// Reinterprets the storage of an int2 as a double (inverse of double_as_int2).
// NOTE(review): the reinterpret_cast target type is missing from this text.
MGPU_HOST_DEVICE double int2_as_double(int2 x) {
return *reinterpret_cast(&x);
}
// Overwrites element [0] of d's storage, viewed through a cast pointer, with x;
// the rest of d is left untouched.
// NOTE(review): the reinterpret_cast target type is missing from this text
// (presumably int*, making element 0 the first 32 bits in memory) - confirm.
MGPU_HOST_DEVICE void SetDoubleX(double& d, int x) {
reinterpret_cast(&d)[0] = x;
}
// Returns the .x component of d after reinterpreting its storage as an int2.
MGPU_HOST_DEVICE int GetDoubleX(double d) {
	const int2 halves = double_as_int2(d);
	return halves.x;
}
// Overwrites element [1] of d's storage, viewed through a cast pointer, with y;
// the rest of d is left untouched.
// NOTE(review): the reinterpret_cast target type is missing from this text
// (presumably int*, making element 1 the second 32 bits in memory) - confirm.
MGPU_HOST_DEVICE void SetDoubleY(double& d, int y) {
reinterpret_cast(&d)[1] = y;
}
// Returns the .y component of d after reinterpreting its storage as an int2.
MGPU_HOST_DEVICE int GetDoubleY(double d) {
	const int2 halves = double_as_int2(d);
	return halves.y;
}
////////////////////////////////////////////////////////////////////////////////
// PTX for bfe and bfi
#if __CUDA_ARCH__ >= 200
// Device-only wrapper that issues the PTX instruction bfe.u32 with source x,
// start position bit and field length numBits, and returns its result.
MGPU_DEVICE uint bfe_ptx(uint x, uint bit, uint numBits) {
uint result;
asm("bfe.u32 %0, %1, %2, %3;" :
"=r"(result) : "r"(x), "r"(bit), "r"(numBits));
return result;
}
// Device-only wrapper that issues the PTX instruction bfi.b32 with operands
// (x, y, bit, numBits) in that order and returns its result.
MGPU_DEVICE uint bfi_ptx(uint x, uint y, uint bit, uint numBits) {
uint result;
asm("bfi.b32 %0, %1, %2, %3, %4;" :
"=r"(result) : "r"(x), "r"(y), "r"(bit), "r"(numBits));
return result;
}
// Device-only wrapper that issues the PTX instruction prmt.b32 with sources
// a and b and selector index, and returns its result.
MGPU_DEVICE uint prmt_ptx(uint a, uint b, uint index) {
uint ret;
asm("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index));
return ret;
}
#endif // __CUDA_ARCH__ >= 200
#if CUDA_VERSION >= 9000
////////////////////////////////////////////////////////////////////////////////
// shfl_add, shfl_max (CUDA_VERSION >= 9000: shfl.sync with an explicit thread mask)
// Shuffle-up-and-add step built on shfl.sync.up.b32.
// The asm shuffles x up by `offset`; when the instruction's predicate output
// is set, the caller's own x is added to the shuffled value, and the register
// is returned either way.
// threadmask is passed as the instruction's membermask operand.
// When __CUDA_ARCH__ is below 300 the asm is compiled out and 0 is returned.
MGPU_DEVICE int shfl_add(int x, int offset, int width = WARP_SIZE, unsigned int threadmask = 0xFFFFFFFF) {
int result = 0;
#if __CUDA_ARCH__ >= 300
// Control operand for shfl derived from the requested segment width.
int mask = (WARP_SIZE - width)<< 8;
asm(
"{.reg .s32 r0;"
".reg .pred p;"
"shfl.sync.up.b32 r0|p, %1, %2, %3, %4;"
"@p add.s32 r0, r0, %5;"
"mov.s32 %0, r0; }"
: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(threadmask), "r"(x));
#endif
return result;
}
// Shuffle-up-and-max step built on shfl.sync.up.b32.
// The asm shuffles x up by `offset`; when the instruction's predicate output
// is set, the register becomes the signed maximum of the shuffled value and
// the caller's own x, and the register is returned either way.
// threadmask is passed as the instruction's membermask operand.
// When __CUDA_ARCH__ is below 300 the asm is compiled out and 0 is returned.
MGPU_DEVICE int shfl_max(int x, int offset, int width = WARP_SIZE, unsigned int threadmask = 0xFFFFFFFF) {
int result = 0;
#if __CUDA_ARCH__ >= 300
// Control operand for shfl derived from the requested segment width.
int mask = (WARP_SIZE - width)<< 8;
asm(
"{.reg .s32 r0;"
".reg .pred p;"
"shfl.sync.up.b32 r0|p, %1, %2, %3, %4;"
"@p max.s32 r0, r0, %5;"
"mov.s32 %0, r0; }"
: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(threadmask), "r"(x));
#endif
return result;
}
#else
////////////////////////////////////////////////////////////////////////////////
// shfl_add, shfl_max (CUDA_VERSION < 9000: non-sync shfl, no thread mask)
// Shuffle-up-and-add step built on the non-sync shfl.up.b32 instruction.
// The asm shuffles x up by `offset`; when the instruction's predicate output
// is set, the caller's own x is added to the shuffled value, and the register
// is returned either way.
// When __CUDA_ARCH__ is below 300 the asm is compiled out and 0 is returned.
MGPU_DEVICE int shfl_add(int x, int offset, int width = WARP_SIZE) {
int result = 0;
#if __CUDA_ARCH__ >= 300
// Control operand for shfl derived from the requested segment width.
int mask = (WARP_SIZE - width)<< 8;
asm(
"{.reg .s32 r0;"
".reg .pred p;"
"shfl.up.b32 r0|p, %1, %2, %3;"
"@p add.s32 r0, r0, %4;"
"mov.s32 %0, r0; }"
: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
#endif
return result;
}
// Shuffle-up-and-max step built on the non-sync shfl.up.b32 instruction.
// The asm shuffles x up by `offset`; when the instruction's predicate output
// is set, the register becomes the signed maximum of the shuffled value and
// the caller's own x, and the register is returned either way.
// When __CUDA_ARCH__ is below 300 the asm is compiled out and 0 is returned.
MGPU_DEVICE int shfl_max(int x, int offset, int width = WARP_SIZE) {
int result = 0;
#if __CUDA_ARCH__ >= 300
// Control operand for shfl derived from the requested segment width.
int mask = (WARP_SIZE - width)<< 8;
asm(
"{.reg .s32 r0;"
".reg .pred p;"
"shfl.up.b32 r0|p, %1, %2, %3;"
"@p max.s32 r0, r0, %4;"
"mov.s32 %0, r0; }"
: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
#endif
return result;
}
#endif
////////////////////////////////////////////////////////////////////////////////
// brev, popc, clz, bfe, bfi, prmt
// Reverse the order of the 32 bits in x (bit 0 becomes bit 31, and so on).
// Uses the __brev intrinsic when __CUDA_ARCH__ >= 200, otherwise a portable
// bit-by-bit loop.
MGPU_HOST_DEVICE uint brev(uint x) {
#if __CUDA_ARCH__ >= 200
	return __brev(x);
#else
	uint reversed = 0;
	for(int pos = 0; pos < 32; ++pos) {
		const uint lowBit = (x >> pos) & 1;
		reversed |= lowBit << (31 - pos);
	}
	return reversed;
#endif
}
// Count the number of set bits in x.
// Uses the __popc intrinsic when __CUDA_ARCH__ >= 200, otherwise clears the
// lowest set bit repeatedly and counts the iterations.
MGPU_HOST_DEVICE int popc(uint x) {
#if __CUDA_ARCH__ >= 200
	return __popc(x);
#else
	int setBits = 0;
	while(x) {
		x &= x - 1;	// drop the lowest set bit
		++setBits;
	}
	return setBits;
#endif
}
// Count leading zeros - start from most significant bit. clz(0) is 32.
// Uses the __clz intrinsic when __CUDA_ARCH__ >= 200. The portable fallback
// tests bits in unsigned arithmetic so that probing bit 31 never shifts a
// signed 1 into the sign bit.
MGPU_HOST_DEVICE int clz(int x) {
#if __CUDA_ARCH__ >= 200
	return __clz(x);
#else
	const unsigned int bits = (unsigned int)x;
	for(int i = 31; i >= 0; --i)
		if((1u << i) & bits) return 31 - i;
	return 32;
#endif
}
// Find first set - start from least significant bit. LSB is 1. ffs(0) is 0.
// Uses the __ffs intrinsic when __CUDA_ARCH__ >= 200. The portable fallback
// tests bits in unsigned arithmetic so that probing bit 31 never shifts a
// signed 1 into the sign bit.
MGPU_HOST_DEVICE int ffs(int x) {
#if __CUDA_ARCH__ >= 200
	return __ffs(x);
#else
	const unsigned int bits = (unsigned int)x;
	for(int i = 0; i < 32; ++i)
		if((1u << i) & bits) return i + 1;
	return 0;
#endif
}
// Bit-field extract: returns numBits bits of x starting at position bit,
// right-aligned and zero-extended.
// Uses the PTX bfe.u32 wrapper when __CUDA_ARCH__ >= 200. The portable
// fallback is defined for every bit/numBits value:
//   - a zero-length field, or a start position past bit 31, yields 0;
//   - a field reaching past bit 31 yields the available high bits of x.
// (The previous fallback shifted by >= 32 or overflowed a signed int for
// numBits >= 31, which is undefined behavior.)
MGPU_HOST_DEVICE uint bfe(uint x, uint bit, uint numBits) {
#if __CUDA_ARCH__ >= 200
	return bfe_ptx(x, bit, numBits);
#else
	if(bit >= 32 || numBits == 0) return 0;
	const uint shifted = x >> bit;
	if(numBits >= 32) return shifted;
	return shifted & ((1u << numBits) - 1u);
#endif
}
// Bit-field insert: returns y with numBits bits, starting at position bit,
// replaced by the low bits of x.
// Uses the PTX bfi.b32 wrapper when __CUDA_ARCH__ >= 200. The portable
// fallback is defined for every bit/numBits value:
//   - a zero-length field, or a start position past bit 31, returns y as is;
//   - a field reaching past bit 31 is truncated at bit 31.
// (The previous fallback underflowed 32 - bit for bit > 32 and shifted by 32
// for a full-width insert at bit 0, both undefined behavior.)
MGPU_HOST_DEVICE uint bfi(uint x, uint y, uint bit, uint numBits) {
	uint result;
#if __CUDA_ARCH__ >= 200
	result = bfi_ptx(x, y, bit, numBits);
#else
	if(bit >= 32 || numBits == 0) {
		result = y;
	} else {
		// Clamp without computing bit + numBits, which could wrap around.
		if(numBits > 32 - bit) numBits = 32 - bit;
		// numBits == 32 is only reachable when bit == 0.
		const uint field = (numBits >= 32) ? 0xffffffffu : ((1u << numBits) - 1u);
		const uint mask = field << bit;
		result = (y & ~mask) | (mask & (x << bit));
	}
#endif
	return result;
}
// Byte permute: builds a 32-bit result from bytes of a and b, chosen by the
// four 4-bit selectors in the low 16 bits of index (selector i controls result
// byte i).
// Uses the PTX prmt.b32 wrapper when __CUDA_ARCH__ >= 200; otherwise each
// selector is decoded by hand:
//   bits 0-1: byte number within the chosen source word,
//   bit 2:    source word (0 selects a, 1 selects b),
//   bit 3:    replace the byte with 0xff or 0 according to its top bit.
MGPU_HOST_DEVICE uint prmt(uint a, uint b, uint index) {
#if __CUDA_ARCH__ >= 200
	return prmt_ptx(a, b, index);
#else
	uint packed = 0;
	for(int lane = 0; lane < 4; ++lane) {
		const uint selector = (index >> (4 * lane)) & 0xf;
		const uint source = ((selector & 7) > 3) ? b : a;
		uint picked = (source >> (8 * (selector & 3))) & 0xff;
		if(selector & 8)
			picked = (picked & 128) ? 0xff : 0;
		packed |= picked << (8 * lane);
	}
	return packed;
#endif
}
// Find floor(log2(x)) as 31 - clz(x). With roundUp set, the result is bumped
// by one whenever MGPU_IS_POW_2(x) is false, giving the next integer
// logarithm.
MGPU_HOST_DEVICE int FindLog2(int x, bool roundUp = false) {
	const int floorLog = 31 - clz(x);
	if(roundUp && !MGPU_IS_POW_2(x))
		return floorLog + 1;
	return floorLog;
}
////////////////////////////////////////////////////////////////////////////////
// vset4
#if __CUDA_ARCH__ >= 300
// Device-only wrapper that issues the PTX instruction vset4.u32.u32.lt.add
// with operands (a, b, c): four byte-wise unsigned "less than" comparisons of
// a against b, combined with c by the instruction's .add secondary operation.
// NOTE(review): the exact way .add folds the per-byte results into c is
// defined by the PTX ISA, not by this file - confirm before relying on it.
MGPU_DEVICE uint vset4_lt_add_ptx(uint a, uint b, uint c) {
uint result;
asm("vset4.u32.u32.lt.add %0, %1, %2, %3;" :
"=r"(result) : "r"(a), "r"(b), "r"(c));
return result;
}
// Device-only wrapper that issues the PTX instruction vset4.u32.u32.eq with
// operands (a, b) and a constant 0 as the third source: four byte-wise
// equality comparisons of a against b.
MGPU_DEVICE uint vset4_eq_ptx(uint a, uint b) {
uint result;
asm("vset4.u32.u32.eq %0, %1, %2, %3;" :
"=r"(result) : "r"(a), "r"(b), "r"(0));
return result;
}
#endif // __CUDA_ARCH__ >= 300
// Byte-wise "less than" with accumulate.
// Uses the PTX wrapper when __CUDA_ARCH__ >= 300. The portable fallback
// starts from c and, for each of the four byte lanes in which a's byte is
// less than b's byte (unsigned), adds 1 at that lane's bit position.
MGPU_HOST_DEVICE uint vset4_lt_add(uint a, uint b, uint c) {
#if __CUDA_ARCH__ >= 300
	return vset4_lt_add_ptx(a, b, c);
#else
	uint total = c;
	for(int lane = 0; lane < 4; ++lane) {
		const int laneShift = 8 * lane;
		const uint laneMask = 0xffu << laneShift;
		if((laneMask & a) < (laneMask & b))
			total += 1u << laneShift;
	}
	return total;
#endif
}
// Byte-wise equality test.
// Uses the PTX wrapper when __CUDA_ARCH__ >= 300. The portable fallback
// returns a word in which each byte lane holds 1 when the corresponding bytes
// of a and b are equal, and 0 otherwise.
MGPU_HOST_DEVICE uint vset4_eq(uint a, uint b) {
#if __CUDA_ARCH__ >= 300
	return vset4_eq_ptx(a, b);
#else
	uint matches = 0;
	for(int lane = 0; lane < 4; ++lane) {
		const int laneShift = 8 * lane;
		const uint laneMask = 0xffu << laneShift;
		if((laneMask & a) == (laneMask & b))
			matches += 1u << laneShift;
	}
	return matches;
#endif
}
////////////////////////////////////////////////////////////////////////////////
//
// Returns the upper 32 bits of the 64-bit product x * y.
// Uses the __umulhi intrinsic when __CUDA_ARCH__ >= 100, otherwise a widening
// multiply followed by a shift.
MGPU_HOST_DEVICE uint umulhi(uint x, uint y) {
#if __CUDA_ARCH__ >= 100
	return __umulhi(x, y);
#else
	const uint64 wide = (uint64)x * y;
	return (uint)(wide >> 32);
#endif
}
////////////////////////////////////////////////////////////////////////////////
// ldg() function defined for all devices and all types. Only compiles to __ldg
// intrinsic for __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400 for types supported
// by __ldg in sm_32_intrinsics.h
// Trait whose primary template reports value = false. Specializations
// generated by DEFINE_LDG_TYPE override it to true.
// NOTE(review): the template parameter list is missing from this text.
template
struct IsLdgType {
enum { value = false };
};
// Emits an explicit specialization of IsLdgType with value = true.
// NOTE(review): the specialization argument (presumably T) is missing from
// this text.
#define DEFINE_LDG_TYPE(T) \
template<> struct IsLdgType { enum { value = true }; };
// Load helper used by ldg(). This primary definition performs an ordinary
// dereference of p.
// NOTE(review): the template parameter list is missing from this text; the
// surviving "::value>" suggests a defaulted bool parameter taken from
// IsLdgType - confirm against the upstream header.
template::value>
struct LdgShim {
MGPU_DEVICE static T Ldg(const T* p) {
return *p;
}
};
// Only when 320 <= __CUDA_ARCH__ < 400: mark the types below via
// DEFINE_LDG_TYPE and provide an LdgShim variant that loads through __ldg.
#if __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400
// List of __ldg-compatible types from sm_32_intrinsics.h.
DEFINE_LDG_TYPE(char)
DEFINE_LDG_TYPE(short)
DEFINE_LDG_TYPE(int)
DEFINE_LDG_TYPE(long long)
DEFINE_LDG_TYPE(char2)
DEFINE_LDG_TYPE(char4)
DEFINE_LDG_TYPE(short2)
DEFINE_LDG_TYPE(short4)
DEFINE_LDG_TYPE(int2)
DEFINE_LDG_TYPE(int4)
DEFINE_LDG_TYPE(longlong2)
DEFINE_LDG_TYPE(unsigned char)
DEFINE_LDG_TYPE(unsigned short)
DEFINE_LDG_TYPE(unsigned int)
DEFINE_LDG_TYPE(unsigned long long)
DEFINE_LDG_TYPE(uchar2)
DEFINE_LDG_TYPE(uchar4)
DEFINE_LDG_TYPE(ushort2)
DEFINE_LDG_TYPE(ushort4)
DEFINE_LDG_TYPE(uint2)
DEFINE_LDG_TYPE(uint4)
DEFINE_LDG_TYPE(ulonglong2)
DEFINE_LDG_TYPE(float)
DEFINE_LDG_TYPE(double)
DEFINE_LDG_TYPE(float2)
DEFINE_LDG_TYPE(float4)
DEFINE_LDG_TYPE(double2)
// LdgShim variant that loads through the __ldg intrinsic.
// NOTE(review): the template parameter/argument lists are missing from this
// text; presumably a partial specialization for the IsLdgType == true case.
template struct LdgShim {
MGPU_DEVICE static T Ldg(const T* p) {
return __ldg(p);
}
};
#endif
// Loads *p through LdgShim, which either dereferences p directly or uses
// __ldg depending on the LdgShim variant selected.
// NOTE(review): the template parameter list and the LdgShim argument are
// missing from this text.
template
MGPU_DEVICE T ldg(const T* p) {
return LdgShim::Ldg(p);
}
////////////////////////////////////////////////////////////////////////////////
// Fast division for 31-bit integers.
// Uses the method in Hacker's Delight (2nd edition) page 228.
// Evaluates for denom > 1 and x < 2^31.
// Precomputed multiply-and-shift replacement for division by a fixed
// denominator: the constructor derives a multiplier and a shift once, and
// Divide/Modulus then use umulhi plus a shift instead of a hardware divide.
struct FastDivide {
	uint denom;	// divisor passed to the constructor
	uint coef;	// multiplier applied through umulhi
	uint shift;	// right shift applied after the multiply

	// Quotient x / denom.
	MGPU_HOST_DEVICE uint Divide(uint x) {
		const uint high = umulhi(x, coef);
		return high >> shift;
	}

	// Remainder x % denom, derived from the quotient.
	MGPU_HOST_DEVICE uint Modulus(uint x) {
		const uint quotient = Divide(x);
		return x - quotient * denom;
	}

	// Derives coef and shift for denom_: p = 31 + ceil(log2(denom)),
	// coef = ceil(2^p / denom), shift = p - 32.
	explicit FastDivide(uint denom_) {
		denom = denom_;
		const uint p = 31 + FindLog2(denom, true);
		const unsigned long long rounded = (1ull << p) + denom - 1;
		coef = (uint)(rounded / denom);
		shift = p - 32;
	}
};
#pragma GCC diagnostic pop
} // namespace mgpu
================================================
FILE: 3rdparty/ctc_include/contrib/moderngpu/include/device/loadstore.cuh
================================================
/******************************************************************************
* Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
*
* Code and text by Sean Baxter, NVIDIA Research
* See http://nvlabs.github.io/moderngpu for repository and documentation.
*
******************************************************************************/
#pragma once
#include "../mgpudevice.cuh"
#include "deviceutil.cuh"
#include "intrinsics.cuh"
namespace mgpu {
////////////////////////////////////////////////////////////////////////////////
// Cooperative load functions.
// Copies VT elements per thread from data into reg using a strided pattern:
// reg[i] receives data[NT * i + tid] for i in [0, VT). No bounds check.
// If sync is true, a __syncthreads() barrier follows the copy.
// NOTE(review): the template parameter list is missing from this text; NT and
// VT are presumably its integer parameters.
template
MGPU_DEVICE void DeviceSharedToReg(InputIt data, int tid, T* reg,
bool sync) {
#pragma unroll
for(int i = 0; i < VT; ++i)
reg[i] = data[NT * i + tid];
if(sync) __syncthreads();
}
// Predicated strided load: reg[i] receives data[NT * i + tid] only when that
// index is below count; other reg slots are left unmodified.
// If sync is true, a __syncthreads() barrier follows the copy.
// NOTE(review): the template parameter list is missing from this text.
template
MGPU_DEVICE void DeviceGlobalToRegPred(int count, InputIt data, int tid,
T* reg, bool sync) {
// TODO: Attempt to issue 4 loads at a time.
#pragma unroll
for(int i = 0; i < VT; ++i) {
int index = NT * i + tid;
if(index < count) reg[i] = data[index];
}
if(sync) __syncthreads();
}
// Strided load of up to VT elements per thread into reg.
// When count covers a full tile (count >= NT * VT) the loads are unguarded;
// otherwise the work is delegated to DeviceGlobalToRegPred, which leaves
// out-of-range reg slots unmodified.
// If sync is true, a __syncthreads() barrier follows the copy.
// NOTE(review): template parameter/argument lists are missing from this text.
template
MGPU_DEVICE void DeviceGlobalToReg(int count, InputIt data, int tid,
T* reg, bool sync) {
if(count >= NT * VT) {
#pragma unroll
for(int i = 0; i < VT; ++i)
reg[i] = data[NT * i + tid];
} else
DeviceGlobalToRegPred(count, data, tid, reg, false);
if(sync) __syncthreads();
}
// Two-stage strided load: DeviceGlobalToReg handles the first part, then
// slots VT0..VT1-1 are loaded with a per-element bounds check against count.
// If sync is true, a __syncthreads() barrier follows the copy.
// NOTE(review): template parameter/argument lists are missing from this text;
// the inner call presumably covers slots [0, VT0) - confirm.
template
MGPU_DEVICE void DeviceGlobalToReg2(int count, InputIt data, int tid,
T* reg, bool sync) {
DeviceGlobalToReg(count, data, tid, reg, false);
#pragma unroll
for(int i = VT0; i < VT1; ++i) {
int index = NT * i + tid;
if(index < count) reg[i] = data[index];
}
if(sync) __syncthreads();
}
// Strided load of VT elements per thread with a fallback value: slots whose
// index NT * i + tid is at or beyond count receive init instead of data.
// A full tile (count >= NT * VT) takes the unguarded path.
// If sync is true, a __syncthreads() barrier follows the copy.
// NOTE(review): the template parameter list is missing from this text.
template
MGPU_DEVICE void DeviceGlobalToRegDefault(int count, InputIt data, int tid,
T* reg, T init, bool sync) {
if(count >= NT * VT) {
#pragma unroll
for(int i = 0; i < VT; ++i)
reg[i] = data[NT * i + tid];
} else {
#pragma unroll
for(int i = 0; i < VT; ++i) {
int index = NT * i + tid;
// Default first, then overwrite when the element exists.
reg[i] = init;
if(index < count) reg[i] = data[index];
}
}
if(sync) __syncthreads();
}
// Two-stage strided load with a fallback value: DeviceGlobalToRegDefault
// handles the first part, then slots VT0..VT1-1 are set to init and
// overwritten with data where the index is below count.
// If sync is true, a __syncthreads() barrier follows the copy.
// NOTE(review): template parameter/argument lists are missing from this text;
// the inner call presumably covers slots [0, VT0) - confirm.
template
MGPU_DEVICE void DeviceGlobalToRegDefault2(int count, InputIt data, int tid,
T* reg, T init, bool sync) {
DeviceGlobalToRegDefault(count, data, tid, reg, init, false);
#pragma unroll
for(int i = VT0; i < VT1; ++i) {
int index = NT * i + tid;
reg[i] = init;
if(index < count) reg[i] = data[index];
}
if(sync) __syncthreads();
}
////////////////////////////////////////////////////////////////////////////////
// Thread-order load: each thread reads VT consecutive elements starting at
// data + VT * tid, through ldg(). With a partial tile (count < NT * VT) only
// the elements that fall below count are loaded; remaining reg slots are left
// unmodified. No barrier is issued.
// NOTE(review): the template parameter list is missing from this text.
template
MGPU_DEVICE void DeviceGlobalToThread(int count, InputIt data, int tid,
T* reg) {
data += VT * tid;
if(count >= NT * VT) {
#pragma unroll
for(int i = 0; i < VT; ++i)
reg[i] = ldg(data + i);
} else {
// Rebase count so it is relative to this thread's first element.
count -= VT * tid;
#pragma unroll
for(int i = 0; i < VT; ++i)
if(i < count) reg[i] = ldg(data + i);
}
}
// Thread-order load with a fallback value: each thread reads VT consecutive
// elements starting at data + VT * tid, through ldg(). With a partial tile,
// slots beyond count receive init. No barrier is issued.
// NOTE(review): the template parameter list is missing from this text.
template
MGPU_DEVICE void DeviceGlobalToThreadDefault(int count, InputIt data, int tid,
T* reg, T init) {
data += VT * tid;
if(count >= NT * VT) {
#pragma unroll
for(int i = 0; i < VT; ++i)
reg[i] = ldg(data + i);
} else {
// Rebase count so it is relative to this thread's first element.
count -= VT * tid;
#pragma unroll
for(int i = 0; i < VT; ++i)
reg[i] = (i < count) ? ldg(data + i) : init;
}
}
////////////////////////////////////////////////////////////////////////////////
// Cooperative store functions.
// Strided store of VT elements per thread: dest[NT * i + tid] receives reg[i]
// converted to T2, the value_type reported by std::iterator_traits.
// No bounds check. If sync is true, a __syncthreads() barrier follows.
// NOTE(review): the template parameter list and the iterator_traits argument
// are missing from this text (presumably OutputIt) - confirm.
template
MGPU_DEVICE void DeviceRegToShared(const T* reg, int tid,
OutputIt dest, bool sync) {
typedef typename std::iterator_traits::value_type T2;
#pragma unroll
for(int i = 0; i < VT; ++i)
dest[NT * i + tid] = (T2)reg[i];
if(sync) __syncthreads();
}
// Predicated strided store: dest[NT * i + tid] receives reg[i] for every slot
// whose index is below count.
// If sync is true, a __syncthreads() barrier follows the stores.
// NOTE(review): the template parameter list is missing from this text.
template
MGPU_DEVICE void DeviceRegToGlobal(int count, const T* reg, int tid,
OutputIt dest, bool sync) {
#pragma unroll
for(int i = 0; i < VT; ++i) {
int index = NT * i + tid;
if(index < count)
dest[index] = reg[i];
}
if(sync) __syncthreads();
}
////////////////////////////////////////////////////////////////////////////////
// DeviceMemToMemLoop
// Transfer from shared memory to global, or global to shared, for transfers
// that are smaller than NT * VT in the average case. The goal is to reduce
// unnecessary comparison logic.
// Copies up to min(VT, 4) strided elements per thread from source to dest,
// staging them in a local array: all loads are issued before any store.
// A full tile (count >= NT * VT) is copied unguarded; otherwise both the loads
// and the stores are guarded by index < count.
// If sync is true, a __syncthreads() barrier follows the copy.
// NOTE(review): the template parameter list and the iterator_traits argument
// are missing from this text.
template
MGPU_DEVICE void DeviceMemToMem4(int count, InputIt source, int tid,
OutputIt dest, bool sync) {
typedef typename std::iterator_traits::value_type T;
// Sized by VT, but only the first Count entries are ever used.
T x[VT];
const int Count = (VT < 4) ? VT : 4;
if(count >= NT * VT) {
#pragma unroll
for(int i = 0; i < Count; ++i)
x[i] = source[NT * i + tid];
#pragma unroll
for(int i = 0; i < Count; ++i)
dest[NT * i + tid] = x[i];
} else {
#pragma unroll
for(int i = 0; i < Count; ++i) {
int index = NT * i + tid;
if(index < count)
x[i] = source[NT * i + tid];
}
#pragma unroll
for(int i = 0; i < Count; ++i) {
int index = NT * i + tid;
if(index < count)
dest[index] = x[i];
}
}
if(sync) __syncthreads();
}
// Copies count elements from source to dest by calling DeviceMemToMem4
// repeatedly, advancing both ranges by 4 * NT elements per iteration and
// passing the remaining element count each time.
// If sync is true, a single __syncthreads() barrier follows the whole loop.
// NOTE(review): template parameter/argument lists are missing from this text.
template
MGPU_DEVICE void DeviceMemToMemLoop(int count, InputIt source, int tid,
OutputIt dest, bool sync) {
for(int i = 0; i < count; i += 4 * NT)
DeviceMemToMem4(count - i, source + i, tid, dest + i,
false);
if(sync) __syncthreads();
}
////////////////////////////////////////////////////////////////////////////////
// Functions to copy between shared and global memory where the average case is
// to transfer NT * VT elements.
// Predicated strided copy straight from source to dest: for each slot whose
// index NT * i + tid is below count, dest[index] receives source[index]
// converted to T2, the value_type reported by std::iterator_traits.
// If sync is true, a __syncthreads() barrier follows the copy.
// NOTE(review): the template parameter list and the iterator_traits argument
// are missing from this text (presumably OutputIt) - confirm.
template
MGPU_DEVICE void DeviceSharedToGlobal(int count, const T* source, int tid,
OutputIt dest, bool sync) {
typedef typename std::iterator_traits::value_type T2;
#pragma unroll
for(int i = 0; i < VT; ++i) {
int index = NT * i + tid;
if(index < count) dest[index] = (T2)source[index];
}
if(sync) __syncthreads();
}
// Copies a tile from source to dest by staging it in a local array:
// DeviceGlobalToReg loads (bounded by count), DeviceRegToShared stores.
// The store step is not bounded by count, so with a partial tile the slots
// that were never loaded are written from uninitialized local storage.
// sync is forwarded to the store step.
// NOTE(review): template parameter/argument lists are missing from this text.
template
MGPU_DEVICE void DeviceGlobalToShared(int count, InputIt source, int tid,
T* dest, bool sync) {
T reg[VT];
DeviceGlobalToReg(count, source, tid, reg, false);
DeviceRegToShared(reg, tid, dest, sync);
}
// Two-stage variant of DeviceGlobalToShared: loads through DeviceGlobalToReg2
// into a VT1-element local array, then stores through DeviceRegToShared.
// sync is forwarded to the store step.
// NOTE(review): template parameter/argument lists are missing from this text.
template
MGPU_DEVICE void DeviceGlobalToShared2(int count, InputIt source, int tid,
T* dest, bool sync) {
T reg[VT1];
DeviceGlobalToReg2(count, source, tid, reg, false);
DeviceRegToShared(reg, tid, dest, sync);
}
// Copies a tile from source to dest with a fallback value:
// DeviceGlobalToRegDefault loads (filling out-of-range slots with init),
// DeviceRegToShared stores. sync is forwarded to the store step.
// NOTE(review): template parameter/argument lists are missing from this text.
template
MGPU_DEVICE void DeviceGlobalToSharedDefault(int count, InputIt source, int tid,
T* dest, T init, bool sync) {
T reg[VT];
DeviceGlobalToRegDefault(count, source, tid, reg, init, false);
DeviceRegToShared(reg, tid, dest, sync);
}
// Two-stage variant of DeviceGlobalToSharedDefault: loads through
// DeviceGlobalToRegDefault2 into a VT1-element local array, then stores
// through DeviceRegToShared. sync is forwarded to the store step.
// NOTE(review): template parameter/argument lists are missing from this text.
template
MGPU_DEVICE void DeviceGlobalToSharedDefault2(int count, InputIt data, int tid,
T* dest, T init, bool sync) {
T reg[VT1];
DeviceGlobalToRegDefault2(count, data, tid, reg, init, false);
DeviceRegToShared(reg, tid, dest, sync);
}
////////////////////////////////////////////////////////////////////////////////
// Copies count elements from source to dest in at most two
// DeviceGlobalToShared passes. The second pass starts at element
// Granularity * NT, with Granularity = min(VT, 3), and runs only when count
// exceeds that offset.
// If sync is true, a single __syncthreads() barrier follows both passes.
// NOTE(review): template parameter/argument lists are missing from this text,
// so the tile size handled by each pass cannot be read from here.
template
MGPU_DEVICE void DeviceGlobalToSharedLoop(int count, InputIt source, int tid,
T* dest, bool sync) {
const int Granularity = MGPU_MIN(VT, 3);
DeviceGlobalToShared(count, source, tid, dest, false);
int offset = Granularity * NT;
if(count > offset)
DeviceGlobalToShared(count - offset,
source + offset, tid, dest + offset, false);
if(sync) __syncthreads();
// Disabled loop-based implementation, kept for reference:
/*
source += tid;
while(count > 0) {
T reg[Granularity];
#pragma unroll
for(int i = 0; i < Granularity; ++i) {
int index = NT * i + tid;
if(index < count)
reg[i] = source[NT * i];
}
DeviceRegToShared(reg, tid, dest, false);
source += Granularity * NT;
dest += Granularity * NT;
count -= Granularity * NT;
}
if(sync) __syncthreads();*/
}
// Copies up to one tile from source to dest by staging it in a local array:
// DeviceGlobalToReg loads and DeviceRegToGlobal stores, both bounded by count.
// sync is forwarded to the store step.
// NOTE(review): template parameter/argument lists and the iterator_traits
// argument are missing from this text.
template
MGPU_DEVICE void DeviceGlobalToGlobal(int count, InputIt source, int tid,
OutputIt dest, bool sync) {
typedef typename std::iterator_traits::value_type T;
T values[VT];
DeviceGlobalToReg(count, source, tid, values, false);
DeviceRegToGlobal(count, values, tid, dest, sync);
}
////////////////////////////////////////////////////////////////////////////////
// Transpose VT elements in NT threads (x) into thread-order registers (y)
// using only NT * VT / 2 elements of shared memory.
// WARNING: this function definitely has a bug; do not use it. TODO(erich): fix.
// Two-pass transpose of strided values x into thread-order values y through
// the scratch buffer `shared`: pass one serves threads with tid < NT / 2,
// pass two serves the rest. The function contains four __syncthreads()
// barriers, so every thread of the CTA must call it.
// The printf below is unconditional, so every calling thread prints the
// warning on every call.
// NOTE(review): the template parameter list is missing from this text.
template
MGPU_DEVICE void HalfSmemTranspose(const T* x, int tid, T* shared, T* y) {
printf("HalfSmemTranspose has a bug, use WAR SmemTranpose or find bug before using in production");
// Transpose the first half values (tid < NT / 2)
#pragma unroll
for(int i = 0; i <= VT / 2; ++i)
if(i < VT / 2 || tid < NT / 2)
shared[NT * i + tid] = x[i];
__syncthreads();
if(tid < NT / 2) {
#pragma unroll
for(int i = 0; i < VT; ++i)
y[i] = shared[VT * tid + i];
}
__syncthreads();
// Transpose the second half values (tid >= NT / 2)
#pragma unroll
for(int i = VT / 2; i < VT; ++i)
if(i > VT / 2 || tid >= NT / 2)
shared[NT * i - NT * VT / 2 + tid] = x[i];
__syncthreads();
if(tid >= NT / 2) {
#pragma unroll
for(int i = 0; i < VT; ++i)
y[i] = shared[VT * tid + i - NT * VT / 2];
}
__syncthreads();
}
////////////////////////////////////////////////////////////////////////////////
// Gather/scatter functions
// Indexed load: reg[i] receives data[indices[i]] for each of the thread's VT
// slots. With a partial tile (count < NT * VT) a slot is loaded only when its
// strided position NT * i + tid is below count; other slots are left
// unmodified.
// If sync is true, a __syncthreads() barrier follows the loads.
// NOTE(review): the template parameter list is missing from this text.
template
MGPU_DEVICE void DeviceGather(int count, InputIt data, int indices[VT],
int tid, T* reg, bool sync) {
if(count >= NT * VT) {
#pragma unroll
for(int i = 0; i < VT; ++i)
reg[i] = data[indices[i]];
} else {
#pragma unroll
for(int i = 0; i < VT; ++i) {
int index = NT * i + tid;
if(index < count)
reg[i] = data[indices[i]];
}
}
if(sync) __syncthreads();
}
// Indexed load with a fallback value: reg[i] receives data[indices[i]] for
// each of the thread's VT slots. With a partial tile, slots whose strided
// position NT * i + tid is at or beyond count receive identity instead.
// If sync is true, a __syncthreads() barrier follows the loads.
// NOTE(review): the template parameter list is missing from this text.
template
MGPU_DEVICE void DeviceGatherDefault(int count, InputIt data, int indices[VT],
int tid, T* reg, T identity, bool sync) {
if(count >= NT * VT) {
#pragma unroll
for(int i = 0; i < VT; ++i)
reg[i] = data[indices[i]];
} else {
#pragma unroll
for(int i = 0; i < VT; ++i) {
int index = NT * i + tid;
reg[i] = (index < count) ? data[indices[i]] : identity;
}
}
if(sync) __syncthreads();
}
// Indexed store: data[indices[i]] receives reg[i] for each of the thread's VT
// slots. With a partial tile (count < NT * VT) a slot is stored only when its
// strided position NT * i + tid is below count.
// If sync is true, a __syncthreads() barrier follows the stores.
// NOTE(review): the template parameter list is missing from this text.
template
MGPU_DEVICE void DeviceScatter(int count, const T* reg, int tid,
int indices[VT], OutputIt data, bool sync) {
if(count >= NT * VT) {
#pragma unroll
for(int i = 0; i < VT; ++i)
data[indices[i]] = reg[i];
} else {
#pragma unroll
for(int i = 0; i < VT; ++i) {
int index = NT * i + tid;
if(index < count)
data[indices[i]] = reg[i];
}
}
if(sync) __syncthreads();
}
////////////////////////////////////////////////////////////////////////////////
// Cooperative transpose functions (strided to thread order)
template
MGPU_DEVICE void DeviceThreadToShared(const T* threadReg, int tid, T* shared,
bool sync) {
if(1 & VT) {
// Odd grain size. Store as type T.
#pragma unroll
for(int i = 0; i < VT; ++i)
shared[VT * tid + i] = threadReg[i];
} else {
// Even grain size. Store as DevicePair