gitextract_pzk3dhhw/

├── .clang-format
├── .clang-tidy
├── .cmake-format.py
├── .devcontainer/
│   ├── Dockerfile
│   └── devcontainer.json
├── .dockerignore
├── .github/
│   ├── CODEOWNERS
│   ├── ISSUE_TEMPLATE/
│   │   ├── blank_issue.yml
│   │   ├── bug_report.md
│   │   ├── documention_issue.yml
│   │   ├── feature_request.yml
│   │   ├── performance_issue.yml
│   │   └── question.yml
│   ├── PULL_REQUEST_TEMPLATE/
│   │   ├── general_template.md
│   │   └── op_template.md
│   ├── actions/
│   │   ├── mac-build/
│   │   │   └── action.yml
│   │   ├── setup/
│   │   │   └── action.yml
│   │   ├── upload_oss/
│   │   │   └── action.yml
│   │   ├── upload_ssh/
│   │   │   └── action.yml
│   │   └── whl/
│   │       └── action.yml
│   ├── scripts/
│   │   ├── requirements.txt
│   │   └── set_initial_variables.py
│   └── workflows/
│       ├── canary.yml
│       ├── community_release.yml
│       ├── on_merge.yml
│       ├── pr.yml
│       ├── priv_release.yml
│       ├── release.yml
│       ├── simple.yml
│       └── test.yml
├── .gitignore
├── .lsan-suppressions
├── .mergify.yml
├── .tsan-suppressions
├── .ubsan-suppressions
├── CMakeLists.txt
├── LICENSE
├── README.md
├── ci/
│   ├── CMakeLists.txt
│   ├── build/
│   │   ├── ensure_img.py
│   │   └── make.sh
│   ├── check/
│   │   ├── clang_tidy_warnings_as_errors_on_diff
│   │   ├── lintutils.py
│   │   ├── run_clang_format.py
│   │   ├── run_clang_tidy.py
│   │   ├── run_cmake_format.py
│   │   ├── run_license_format.py
│   │   └── run_py_format.py
│   ├── clang/
│   │   └── build-llvm.sh
│   ├── conda/
│   │   ├── build-clang.sh
│   │   └── tuna.condarc
│   ├── fixed-dev-requirements.txt
│   ├── manylinux/
│   │   ├── build-gcc7-xla.sh
│   │   ├── build-gcc9.sh
│   │   └── build.sh
│   ├── requirements.txt
│   ├── reset_submodule.sh
│   ├── setup_submodule.py
│   ├── setup_submodule.sh
│   └── test/
│       ├── 1node_benchmark_test.sh
│       ├── 1node_benchmark_test_fp16.sh
│       ├── 1node_custom_op_test.sh
│       ├── 1node_model_eager_test.sh
│       ├── 1node_model_test.sh
│       ├── 1node_op_test.sh
│       ├── 2node_op_test.sh
│       ├── 2node_op_test_multi_client.sh
│       ├── CMakeLists.txt
│       ├── build_docs.sh
│       ├── distributed_run.py
│       ├── doctest.sh
│       ├── excludelist
│       ├── expensive_generic_test_multi_client.sh
│       ├── generic_test.sh
│       ├── generic_test_multi_client.sh
│       ├── ir_tests.sh
│       ├── multi_client_exception_test.sh
│       ├── multi_launch.py
│       ├── parallel_run.py
│       ├── print_stack_from_core.sh
│       ├── print_stack_in_all_dirs.sh
│       ├── resource-spec/
│       │   ├── 1x-gtx-1080.json
│       │   ├── 2x-rtx-2080.json
│       │   └── 4x-rtx-2080ti.json
│       ├── test_mock_function.sh
│       ├── test_mock_script.sh
│       ├── test_resnet50_graph_ddp.sh
│       ├── test_speed_multi_client.sh
│       └── try_install.sh
├── cmake/
│   ├── caches/
│   │   ├── ci/
│   │   │   ├── canary/
│   │   │   │   └── cuda.cmake
│   │   │   ├── cpu-asan-ubsan.cmake
│   │   │   ├── cpu-tsan.cmake
│   │   │   ├── cpu.cmake
│   │   │   ├── cuda-xla.cmake
│   │   │   ├── cuda.cmake
│   │   │   ├── gh-hosted/
│   │   │   │   ├── cpu-clang.cmake
│   │   │   │   └── cpu-gcc.cmake
│   │   │   ├── llvm/
│   │   │   │   └── cuda-75-clang.cmake
│   │   │   ├── profiler/
│   │   │   │   └── cuda.cmake
│   │   │   ├── release/
│   │   │   │   ├── cpu.cmake
│   │   │   │   ├── cu118.cmake
│   │   │   │   └── cuda.cmake
│   │   │   └── serving/
│   │   │       ├── cuda-75.cmake
│   │   │       └── openvino.cmake
│   │   ├── cn/
│   │   │   ├── cpu.cmake
│   │   │   ├── cuda.cmake
│   │   │   └── fast/
│   │   │       ├── cpu-clang.cmake
│   │   │       ├── cpu.cmake
│   │   │       ├── cuda-61-clang.cmake
│   │   │       ├── cuda-61.cmake
│   │   │       ├── cuda-75-clang.cmake
│   │   │       ├── cuda-75.cmake
│   │   │       ├── cuda-86.cmake
│   │   │       ├── mlir-cpu.cmake
│   │   │       ├── mlir-cuda-61.cmake
│   │   │       ├── mlir-cuda-75.cmake
│   │   │       ├── mlir-cuda-80.cmake
│   │   │       └── mlir-cuda-86.cmake
│   │   └── international/
│   │       ├── cpu.cmake
│   │       └── cuda.cmake
│   ├── cuda.cmake
│   ├── functional.cmake
│   ├── git_version.cmake
│   ├── oneflow-config.cmake
│   ├── oneflow.cmake
│   ├── op_schema.cmake
│   ├── platform.cmake
│   ├── proto2cpp.cmake
│   ├── pybind11.cmake
│   ├── python.cmake
│   ├── third_party/
│   │   ├── FindBFD.cmake
│   │   ├── FindBLAS.cmake
│   │   ├── FindCUDNN.cmake
│   │   ├── FindUnwind.cmake
│   │   ├── absl.cmake
│   │   ├── cares.cmake
│   │   ├── cocoapi.cmake
│   │   ├── cub.cmake
│   │   ├── cutlass.cmake
│   │   ├── eigen.cmake
│   │   ├── flash_attention.cmake
│   │   ├── flatbuffers.cmake
│   │   ├── glog.cmake
│   │   ├── googletest.cmake
│   │   ├── grpc.cmake
│   │   ├── half.cmake
│   │   ├── header_index/
│   │   │   ├── cub_headers.txt
│   │   │   ├── grpc_headers.txt
│   │   │   ├── libpng_headers.txt
│   │   │   └── opencv_headers.txt
│   │   ├── hwloc.cmake
│   │   ├── json.cmake
│   │   ├── libjpeg-turbo.cmake
│   │   ├── nccl.cmake
│   │   ├── oneDNN.cmake
│   │   ├── opencv.cmake
│   │   ├── openssl.cmake
│   │   ├── patches/
│   │   │   └── tensorflow-logging.patch
│   │   ├── protobuf.cmake
│   │   ├── re2.cmake
│   │   ├── trt_flash_attention.cmake
│   │   └── zlib.cmake
│   ├── third_party.cmake
│   ├── threading.cmake
│   └── util.cmake
├── dev-requirements.txt
├── docker/
│   ├── build/
│   │   ├── Dockerfile
│   │   ├── build-ubuntu.sh
│   │   ├── build.sh
│   │   ├── build.ubuntu.dockerfile
│   │   ├── launch.sh
│   │   └── test.sh
│   ├── ci/
│   │   ├── base/
│   │   │   └── Dockerfile
│   │   ├── fmt/
│   │   │   ├── Dockerfile
│   │   │   └── build.sh
│   │   ├── make/
│   │   │   └── Dockerfile
│   │   ├── test/
│   │   │   ├── Dockerfile
│   │   │   ├── build.sh
│   │   │   ├── launch.sh
│   │   │   └── requirements.txt
│   │   ├── test-v2/
│   │   │   ├── Dockerfile
│   │   │   ├── build.sh
│   │   │   ├── requirements.txt
│   │   │   └── sources.list
│   │   └── third_party/
│   │       └── Dockerfile
│   └── package/
│       └── manylinux/
│           ├── CentOS-Base.repo
│           ├── CentOS7-Base-163.repo
│           ├── Dockerfile
│           ├── README.md
│           ├── build_wheel.py
│           └── launch.sh
├── docs/
│   ├── Makefile
│   ├── requirements.txt
│   └── source/
│       ├── _static/
│       │   └── .gitkeep
│       ├── auto_parallel.rst
│       ├── autograd.rst
│       ├── cn/
│       │   ├── __init__.py
│       │   ├── activation.py
│       │   └── math_ops.py
│       ├── conf.py
│       ├── cuda.rst
│       ├── distributed.rst
│       ├── distributions.rst
│       ├── environment_variables.rst
│       ├── graph.rst
│       ├── hub.rst
│       ├── image.rst
│       ├── index.rst
│       ├── linalg.rst
│       ├── nn.functional.rst
│       ├── nn.init.rst
│       ├── nn.rst
│       ├── one_embedding.rst
│       ├── oneflow.rst
│       ├── optim.rst
│       ├── special.rst
│       ├── tensor.rst
│       ├── tensor_attributes.rst
│       ├── troubleshooting.md
│       ├── type_info.rst
│       ├── utils.data.rst
│       ├── utils.global_view.rst
│       └── utils.tensor.rst
├── external/
│   ├── CMakeLists.txt
│   ├── fmt/
│   │   └── CMakeLists.txt
│   ├── kineto/
│   │   └── CMakeLists.txt
│   ├── onetbb/
│   │   └── CMakeLists.txt
│   └── robin-hood-hashing/
│       └── CMakeLists.txt
├── oneflow/
│   ├── api/
│   │   ├── common/
│   │   │   ├── ir_pass.cpp
│   │   │   ├── job_build_and_infer_ctx.h
│   │   │   ├── sbp.h
│   │   │   └── variable_tensor_mgr.h
│   │   ├── cpp/
│   │   │   ├── api.h
│   │   │   ├── embedding/
│   │   │   │   ├── embedding.cpp
│   │   │   │   └── embedding.h
│   │   │   ├── env.cpp
│   │   │   ├── env.h
│   │   │   ├── env_impl.cpp
│   │   │   ├── env_impl.h
│   │   │   ├── framework/
│   │   │   │   ├── device.cpp
│   │   │   │   ├── device.h
│   │   │   │   ├── dtype.cpp
│   │   │   │   ├── dtype.h
│   │   │   │   ├── graph.cpp
│   │   │   │   ├── graph.h
│   │   │   │   ├── ivalue.cpp
│   │   │   │   ├── ivalue.h
│   │   │   │   ├── shape.cpp
│   │   │   │   ├── shape.h
│   │   │   │   ├── tensor.cpp
│   │   │   │   └── tensor.h
│   │   │   ├── framework.h
│   │   │   ├── nn/
│   │   │   │   └── functional/
│   │   │   │       ├── activation.cpp
│   │   │   │       └── activation.h
│   │   │   ├── nn.h
│   │   │   └── tests/
│   │   │       ├── api_test.cpp
│   │   │       ├── api_test.h
│   │   │       ├── graph_test.cpp
│   │   │       ├── graph_test_model/
│   │   │       │   ├── affine_no_parameter/
│   │   │       │   │   └── model.mlir
│   │   │       │   └── affine_with_parameter/
│   │   │       │       ├── model.a/
│   │   │       │       │   ├── meta
│   │   │       │       │   └── out
│   │   │       │       ├── model.b/
│   │   │       │       │   ├── meta
│   │   │       │       │   └── out
│   │   │       │       └── model.mlir
│   │   │       ├── ivalue_test.cpp
│   │   │       ├── nn_test.cpp
│   │   │       ├── one_embedding_test.cpp
│   │   │       └── tensor_test.cpp
│   │   └── python/
│   │       ├── autograd/
│   │       │   ├── autograd.cpp
│   │       │   ├── autograd_engine.cpp
│   │       │   ├── autograd_function.cpp
│   │       │   ├── autograd_function_state.cpp
│   │       │   ├── autograd_function_state.h
│   │       │   ├── autograd_mode.cpp
│   │       │   └── function_node.cpp
│   │       ├── caster/
│   │       │   ├── autograd_function_state.h
│   │       │   ├── common.h
│   │       │   ├── maybe.h
│   │       │   ├── optional.h
│   │       │   ├── size.h
│   │       │   ├── tensor.h
│   │       │   └── test.cpp
│   │       ├── deprecated.cpp
│   │       ├── dlpack/
│   │       │   ├── converter.cpp
│   │       │   ├── converter.h
│   │       │   └── dlpack.h
│   │       ├── eager/
│   │       │   └── eager.cpp
│   │       ├── env/
│   │       │   ├── env.cpp
│   │       │   └── env.h
│   │       ├── ep/
│   │       │   └── cuda_matmul_mode.cpp
│   │       ├── exception/
│   │       │   ├── exception.cpp
│   │       │   └── exception.h
│   │       ├── flags.cpp
│   │       ├── framework/
│   │       │   ├── autocast.cpp
│   │       │   ├── device.cpp
│   │       │   ├── doc.cpp
│   │       │   ├── dtype.cpp
│   │       │   ├── framework.cpp
│   │       │   ├── framework.h
│   │       │   ├── global_mode.cpp
│   │       │   ├── id_state.cpp
│   │       │   ├── id_util.cpp
│   │       │   ├── instructions_builder.cpp
│   │       │   ├── layout.cpp
│   │       │   ├── memory_format.cpp
│   │       │   ├── memory_format.h
│   │       │   ├── nn_graph.cpp
│   │       │   ├── one_embedding.cpp
│   │       │   ├── op_builder.cpp
│   │       │   ├── op_expr.cpp
│   │       │   ├── parallel_conf_util.cpp
│   │       │   ├── py_kernel_registry.cpp
│   │       │   ├── random_generator.cpp
│   │       │   ├── scope_util.cpp
│   │       │   ├── session_util.cpp
│   │       │   ├── shut_down_util.cpp
│   │       │   ├── size.cpp
│   │       │   ├── size.h
│   │       │   ├── stream.cpp
│   │       │   ├── tensor.cpp
│   │       │   ├── tensor.h
│   │       │   ├── tensor_functions.cpp
│   │       │   ├── tensor_functions_util.h
│   │       │   ├── tensor_tuple.cpp
│   │       │   ├── tensortype.cpp
│   │       │   ├── tensortype.h
│   │       │   ├── thread.cpp
│   │       │   ├── thread.h
│   │       │   ├── typeinfo.cpp
│   │       │   ├── typeinfo.h
│   │       │   └── variable_tensor_mgr.cpp
│   │       ├── functional/
│   │       │   ├── common.cpp
│   │       │   ├── common.h
│   │       │   ├── dispatch_stateful_ops.cpp
│   │       │   ├── dispatch_stateful_ops.yaml
│   │       │   ├── function_def.h
│   │       │   ├── indexing.cpp
│   │       │   ├── indexing.h
│   │       │   ├── python_arg.cpp
│   │       │   ├── python_arg.h
│   │       │   ├── python_arg_parser.cpp
│   │       │   ├── python_arg_parser.h
│   │       │   ├── python_return_types.h
│   │       │   ├── tensor_api.cpp
│   │       │   ├── tensor_api.yaml
│   │       │   ├── value_types.cpp
│   │       │   └── value_types.h
│   │       ├── gil_foreign_lock_helper.cpp
│   │       ├── init.cpp
│   │       ├── ir.cpp
│   │       ├── job_build/
│   │       │   ├── job_build_and_infer.cpp
│   │       │   ├── job_build_and_infer.h
│   │       │   └── lazy_mode.cpp
│   │       ├── multiprocessing/
│   │       │   ├── init.cpp
│   │       │   ├── object_ptr.cpp
│   │       │   ├── object_ptr.h
│   │       │   └── shared_memory.cpp
│   │       ├── numpy/
│   │       │   └── init_numpy_c_api.cpp
│   │       ├── of_api_registry.cpp
│   │       ├── of_api_registry.h
│   │       ├── profiler.cpp
│   │       ├── registry/
│   │       │   └── registry.cpp
│   │       ├── remat/
│   │       │   └── remat.cpp
│   │       ├── rpc/
│   │       │   ├── ccl.cpp
│   │       │   └── rank_group.cpp
│   │       ├── session/
│   │       │   └── session.cpp
│   │       ├── stack_getter.cpp
│   │       ├── symbol/
│   │       │   ├── job_conf_symbol.cpp
│   │       │   ├── op_conf_symbol.cpp
│   │       │   ├── placement_symbol.cpp
│   │       │   ├── sbp_symbol.cpp
│   │       │   └── scope_symbol.cpp
│   │       └── utils/
│   │           ├── dataloader.cpp
│   │           ├── tensor_utils.cpp
│   │           └── tensor_utils.h
│   ├── core/
│   │   ├── auto_parallel/
│   │   │   ├── algorithm_util.cpp
│   │   │   ├── algorithm_util.h
│   │   │   ├── auto_memory.cpp
│   │   │   ├── auto_memory.h
│   │   │   ├── binary_set.cpp
│   │   │   ├── binary_set.h
│   │   │   ├── boxing_collector.cpp
│   │   │   ├── boxing_collector.h
│   │   │   ├── sbp_collector.cpp
│   │   │   ├── sbp_collector.h
│   │   │   ├── sbp_constructor.cpp
│   │   │   ├── sbp_constructor.h
│   │   │   ├── sbp_edge.cpp
│   │   │   ├── sbp_edge.h
│   │   │   ├── sbp_graph.cpp
│   │   │   ├── sbp_graph.h
│   │   │   ├── sbp_node.cpp
│   │   │   ├── sbp_node.h
│   │   │   ├── sbp_util.cpp
│   │   │   └── sbp_util.h
│   │   ├── autograd/
│   │   │   ├── autograd_captured_tensor.h
│   │   │   ├── autograd_engine.cpp
│   │   │   ├── autograd_engine.h
│   │   │   ├── autograd_function.cpp
│   │   │   ├── autograd_function.h
│   │   │   ├── autograd_meta.cpp
│   │   │   ├── autograd_meta.h
│   │   │   ├── autograd_mode.cpp
│   │   │   ├── autograd_mode.h
│   │   │   ├── gradient_funcs/
│   │   │   │   ├── activation.cpp
│   │   │   │   ├── adaptive_avg_pool.cpp
│   │   │   │   ├── adaptive_max_pool.cpp
│   │   │   │   ├── add_n.cpp
│   │   │   │   ├── affine_grid.cpp
│   │   │   │   ├── amp_white_identity.cpp
│   │   │   │   ├── as_strided.cpp
│   │   │   │   ├── avg_pool.cpp
│   │   │   │   ├── batch_gather.cpp
│   │   │   │   ├── bias_add.cpp
│   │   │   │   ├── binary_cross_entropy.cpp
│   │   │   │   ├── binary_cross_entropy_with_logits.cpp
│   │   │   │   ├── binary_cross_entropy_with_logits_reduce_mean.cpp
│   │   │   │   ├── broadcast_binary_ops.cpp
│   │   │   │   ├── broadcast_like.cpp
│   │   │   │   ├── cast.cpp
│   │   │   │   ├── clip_by_scalar.cpp
│   │   │   │   ├── clip_by_scalar_max.cpp
│   │   │   │   ├── clip_by_scalar_min.cpp
│   │   │   │   ├── combined_margin_loss.cpp
│   │   │   │   ├── complex.cpp
│   │   │   │   ├── concat.cpp
│   │   │   │   ├── conv.cpp
│   │   │   │   ├── copy.cpp
│   │   │   │   ├── ctc_loss.cpp
│   │   │   │   ├── cublas_fused_mlp.cpp
│   │   │   │   ├── cum_ops.cpp
│   │   │   │   ├── deconv.cpp
│   │   │   │   ├── deform_conv.cpp
│   │   │   │   ├── depand.cpp
│   │   │   │   ├── det.cpp
│   │   │   │   ├── diag.cpp
│   │   │   │   ├── diagonal.cpp
│   │   │   │   ├── dim_gather.cpp
│   │   │   │   ├── dim_scatter.cpp
│   │   │   │   ├── dot.cpp
│   │   │   │   ├── dropout.cpp
│   │   │   │   ├── eager_ccl_broadcast.cpp
│   │   │   │   ├── elementwise_minimum_maximum.cpp
│   │   │   │   ├── embedding.cpp
│   │   │   │   ├── expand.cpp
│   │   │   │   ├── fake_quantization.cpp
│   │   │   │   ├── fft.cpp
│   │   │   │   ├── fill.cpp
│   │   │   │   ├── flatten.cpp
│   │   │   │   ├── flip.cpp
│   │   │   │   ├── fold.cpp
│   │   │   │   ├── fused_bias_add_dropout.cpp
│   │   │   │   ├── fused_bias_add_gelu.cpp
│   │   │   │   ├── fused_bias_add_scale_mask_softmax_dropout.cpp
│   │   │   │   ├── fused_center.cpp
│   │   │   │   ├── fused_cross_interaction.cpp
│   │   │   │   ├── fused_dot_feature_interaction.cpp
│   │   │   │   ├── fused_fast_gelu_mul.cpp
│   │   │   │   ├── fused_get_boundding_boxes_coord.cpp
│   │   │   │   ├── fused_get_ciou_diagonal_angle.cpp
│   │   │   │   ├── fused_get_ciou_result.cpp
│   │   │   │   ├── fused_get_convex_diagonal_squared.cpp
│   │   │   │   ├── fused_get_intersection_area.cpp
│   │   │   │   ├── fused_get_iou.cpp
│   │   │   │   ├── fused_glu.cpp
│   │   │   │   ├── fused_gru_cell.cpp
│   │   │   │   ├── fused_lstm_cell.cpp
│   │   │   │   ├── fused_matmul_bias.cpp
│   │   │   │   ├── fused_matmul_bias_add_relu_dropout.cpp
│   │   │   │   ├── fused_scale_mask_bias_softmax.cpp
│   │   │   │   ├── fused_scale_mask_softmax.cpp
│   │   │   │   ├── fused_scale_mask_softmax_dropout.cpp
│   │   │   │   ├── fused_scale_tril.cpp
│   │   │   │   ├── fused_scale_tril_softmax_mask_scale.cpp
│   │   │   │   ├── fused_self_attention.cpp
│   │   │   │   ├── fused_weighted_sum.cpp
│   │   │   │   ├── gather.cpp
│   │   │   │   ├── gather_nd.cpp
│   │   │   │   ├── global_cast.cpp
│   │   │   │   ├── global_to_global.cpp
│   │   │   │   ├── gradient_accumulation.cpp
│   │   │   │   ├── graph_feed_and_fetch.cpp
│   │   │   │   ├── grid_sample.cpp
│   │   │   │   ├── group_norm.cpp
│   │   │   │   ├── identity.cpp
│   │   │   │   ├── inv.cpp
│   │   │   │   ├── kl_div.cpp
│   │   │   │   ├── l2_normalize.cpp
│   │   │   │   ├── layer_norm.cpp
│   │   │   │   ├── lerp.cpp
│   │   │   │   ├── linalg_cross.cpp
│   │   │   │   ├── log_softmax.cpp
│   │   │   │   ├── masked_fill.cpp
│   │   │   │   ├── math_binary_op.cpp
│   │   │   │   ├── math_unary_op.cpp
│   │   │   │   ├── matmul.cpp
│   │   │   │   ├── matrix_vector_product.cpp
│   │   │   │   ├── max_pool.cpp
│   │   │   │   ├── max_unpool.cpp
│   │   │   │   ├── median.cpp
│   │   │   │   ├── mode.cpp
│   │   │   │   ├── narrow.cpp
│   │   │   │   ├── nll.cpp
│   │   │   │   ├── noncontiguous_binary_op.cpp
│   │   │   │   ├── normalization.cpp
│   │   │   │   ├── normalization_add_relu.cpp
│   │   │   │   ├── one_embedding_fused_lookup.cpp
│   │   │   │   ├── padding.cpp
│   │   │   │   ├── partial_fc_sample.cpp
│   │   │   │   ├── reduce_ops.cpp
│   │   │   │   ├── reduce_sum_like.cpp
│   │   │   │   ├── reshape.cpp
│   │   │   │   ├── rms_norm.cpp
│   │   │   │   ├── roi_align.cpp
│   │   │   │   ├── roll.cpp
│   │   │   │   ├── rrelu.cpp
│   │   │   │   ├── scalar_add.cpp
│   │   │   │   ├── scalar_div.cpp
│   │   │   │   ├── scalar_floordiv.cpp
│   │   │   │   ├── scalar_fmod.cpp
│   │   │   │   ├── scalar_mul.cpp
│   │   │   │   ├── scalar_pow.cpp
│   │   │   │   ├── scalar_truncdiv.cpp
│   │   │   │   ├── scaled_dot_product_attention.cpp
│   │   │   │   ├── scatter_nd.cpp
│   │   │   │   ├── select_top_n.cpp
│   │   │   │   ├── slice.cpp
│   │   │   │   ├── smooth_l1_loss.cpp
│   │   │   │   ├── softmax.cpp
│   │   │   │   ├── softmax_cross_entropy.cpp
│   │   │   │   ├── sparse_cross_entropy.cpp
│   │   │   │   ├── sparse_softmax_cross_entropy.cpp
│   │   │   │   ├── sparse_softmax_cross_entropy_ms.cpp
│   │   │   │   ├── split_like.cpp
│   │   │   │   ├── squeeze.cpp
│   │   │   │   ├── stack.cpp
│   │   │   │   ├── tensor_scalar_binary.cpp
│   │   │   │   ├── tensor_scatter_nd_update.cpp
│   │   │   │   ├── tf_pool.cpp
│   │   │   │   ├── to_contiguous.cpp
│   │   │   │   ├── transpose.cpp
│   │   │   │   ├── tril.cpp
│   │   │   │   ├── triu.cpp
│   │   │   │   ├── trunc.cpp
│   │   │   │   ├── two_stage_reduce.cpp
│   │   │   │   ├── unfold.cpp
│   │   │   │   ├── unfold_tensor.cpp
│   │   │   │   ├── unsqueeze.cpp
│   │   │   │   ├── upsample.cpp
│   │   │   │   ├── variance.cpp
│   │   │   │   ├── vector_matrix_product.cpp
│   │   │   │   └── where.cpp
│   │   │   └── higher_order_gradient_funcs/
│   │   │       ├── activation.cpp
│   │   │       ├── avg_pool.cpp
│   │   │       ├── binary_cross_entropy_loss.cpp
│   │   │       ├── binary_cross_entropy_with_logits.cpp
│   │   │       ├── binary_cross_entropy_with_logits_reduce_mean.cpp
│   │   │       ├── conv.cpp
│   │   │       ├── div.cpp
│   │   │       ├── kl_div_loss.cpp
│   │   │       ├── log_softmax.cpp
│   │   │       ├── math_unary_op.cpp
│   │   │       ├── matmul.cpp
│   │   │       ├── max_pool.cpp
│   │   │       ├── nll_loss.cpp
│   │   │       ├── pow.cpp
│   │   │       ├── scalar_pow.cpp
│   │   │       ├── slice.cpp
│   │   │       ├── smooth_l1_loss.cpp
│   │   │       └── softmax.cpp
│   │   ├── boxing/
│   │   │   ├── asymmetric_broadcast.cpp
│   │   │   ├── boxing_dividor.h
│   │   │   ├── boxing_dividor_util.cpp
│   │   │   ├── boxing_dividor_util.h
│   │   │   ├── boxing_interpreter_status.cpp
│   │   │   ├── boxing_interpreter_status.h
│   │   │   ├── ccl_boxing_function.cpp
│   │   │   ├── cuda_copy_boxing_interpreter.cpp
│   │   │   ├── eager_boxing_interpreter.cpp
│   │   │   ├── eager_boxing_interpreter.h
│   │   │   ├── eager_boxing_interpreter_mgr.cpp
│   │   │   ├── eager_boxing_interpreter_mgr.h
│   │   │   ├── eager_boxing_logger.cpp
│   │   │   ├── eager_boxing_logger.h
│   │   │   ├── flatten_hierarchy.cpp
│   │   │   ├── generic_symmetric_nd_sbp_boxing.cpp
│   │   │   ├── identity_boxing_interpreter.cpp
│   │   │   ├── naive_1_to_p_boxing.cpp
│   │   │   ├── naive_b_to_1_boxing.cpp
│   │   │   ├── naive_b_to_s_boxing.cpp
│   │   │   ├── naive_p_to_b_boxing.cpp
│   │   │   ├── naive_p_to_s_boxing.cpp
│   │   │   ├── naive_s_to_b_boxing.cpp
│   │   │   ├── naive_s_to_p_boxing.cpp
│   │   │   ├── naive_s_to_s_boxing.cpp
│   │   │   ├── nd_sbp_dim_reduce_boxing.cpp
│   │   │   ├── one_to_one_boxing.cpp
│   │   │   ├── slice_boxing_util.cpp
│   │   │   ├── slice_boxing_util.h
│   │   │   ├── symmetric_acyclic_nd_sbp_boxing.cpp
│   │   │   ├── symmetric_b_to_p_boxing.cpp
│   │   │   ├── symmetric_b_to_s_boxing.cpp
│   │   │   ├── symmetric_s_to_p_boxing.cpp
│   │   │   └── unflatten_hierarchy.cpp
│   │   ├── ccl/
│   │   │   ├── ccl.cpp
│   │   │   └── ccl.h
│   │   ├── comm_network/
│   │   │   ├── comm_network.cpp
│   │   │   ├── comm_network.h
│   │   │   ├── epoll/
│   │   │   │   ├── epoll_comm_network.cpp
│   │   │   │   ├── epoll_comm_network.h
│   │   │   │   ├── io_event_poller.cpp
│   │   │   │   ├── io_event_poller.h
│   │   │   │   ├── socket_helper.cpp
│   │   │   │   ├── socket_helper.h
│   │   │   │   ├── socket_memory_desc.h
│   │   │   │   ├── socket_message.h
│   │   │   │   ├── socket_read_helper.cpp
│   │   │   │   ├── socket_read_helper.h
│   │   │   │   ├── socket_write_helper.cpp
│   │   │   │   └── socket_write_helper.h
│   │   │   └── ibverbs/
│   │   │       ├── ibverbs.proto
│   │   │       ├── ibverbs_comm_network.cpp
│   │   │       ├── ibverbs_comm_network.h
│   │   │       ├── ibverbs_memory_desc.cpp
│   │   │       ├── ibverbs_memory_desc.h
│   │   │       ├── ibverbs_qp.cpp
│   │   │       └── ibverbs_qp.h
│   │   ├── common/
│   │   │   ├── array_ref.h
│   │   │   ├── auto_registration_factory.h
│   │   │   ├── balanced_splitter.cpp
│   │   │   ├── balanced_splitter.h
│   │   │   ├── balanced_splitter_test.cpp
│   │   │   ├── bfloat16.h
│   │   │   ├── bfloat16_math.h
│   │   │   ├── bfloat16_test.cpp
│   │   │   ├── blas.h
│   │   │   ├── blocking_counter.cpp
│   │   │   ├── blocking_counter.h
│   │   │   ├── blocking_then_busy.h
│   │   │   ├── buffer.h
│   │   │   ├── buffer_manager.h
│   │   │   ├── cached_caller.cpp
│   │   │   ├── cached_caller.h
│   │   │   ├── cblas.h
│   │   │   ├── channel.h
│   │   │   ├── channel_test.cpp
│   │   │   ├── check.cpp
│   │   │   ├── check.h
│   │   │   ├── check_level.cpp
│   │   │   ├── check_level.h
│   │   │   ├── constant.h
│   │   │   ├── container_util.h
│   │   │   ├── container_util_test.cpp
│   │   │   ├── cost_util.h
│   │   │   ├── cpp_attribute.h
│   │   │   ├── data_type.cpp
│   │   │   ├── data_type.h
│   │   │   ├── data_type.proto
│   │   │   ├── data_type_converter.h
│   │   │   ├── data_type_converter_test.cpp
│   │   │   ├── data_type_converter_test_static.h
│   │   │   ├── data_type_seq.h
│   │   │   ├── decorator.h
│   │   │   ├── decorator_test.cpp
│   │   │   ├── device.proto
│   │   │   ├── device_type.cpp
│   │   │   ├── device_type.h
│   │   │   ├── device_type.proto
│   │   │   ├── dtype_signature.h
│   │   │   ├── dtype_signature.proto
│   │   │   ├── eigen_util.h
│   │   │   ├── either_ptr.h
│   │   │   ├── env_var/
│   │   │   │   ├── bootstrap.h
│   │   │   │   ├── debug_mode.h
│   │   │   │   ├── eager.h
│   │   │   │   ├── env_var.h
│   │   │   │   ├── remat.h
│   │   │   │   ├── stream.h
│   │   │   │   └── vm.h
│   │   │   ├── error.cpp
│   │   │   ├── error.h
│   │   │   ├── error.proto
│   │   │   ├── error_util.cpp
│   │   │   ├── error_util.h
│   │   │   ├── exception.h
│   │   │   ├── flat_shape.cpp
│   │   │   ├── flat_shape.h
│   │   │   ├── foreign_lock_helper.cpp
│   │   │   ├── foreign_lock_helper.h
│   │   │   ├── function_traits.h
│   │   │   ├── hash.h
│   │   │   ├── hash_container.h
│   │   │   ├── hash_eq_trait_ptr.h
│   │   │   ├── high_order_bool.h
│   │   │   ├── just.h
│   │   │   ├── layout_standardize.h
│   │   │   ├── math_util.cpp
│   │   │   ├── math_util.h
│   │   │   ├── maybe.h
│   │   │   ├── maybe_test.cpp
│   │   │   ├── mem_util.cpp
│   │   │   ├── mem_util.h
│   │   │   ├── memory_format.proto
│   │   │   ├── meta_util.hpp
│   │   │   ├── nd_index.cpp
│   │   │   ├── nd_index.h
│   │   │   ├── nd_index_offset_helper.h
│   │   │   ├── nd_index_offset_helper_test.cpp
│   │   │   ├── not_equal_to_previous_adjacent_iterator.h
│   │   │   ├── notifier.cpp
│   │   │   ├── notifier.h
│   │   │   ├── of_unused.h
│   │   │   ├── op_args_reserved_size.h
│   │   │   ├── op_args_vector.h
│   │   │   ├── optional.h
│   │   │   ├── optional_test.cpp
│   │   │   ├── pcheck.h
│   │   │   ├── permutation_iterator.h
│   │   │   ├── platform.h
│   │   │   ├── preprocessor.h
│   │   │   ├── preprocessor_internal.h
│   │   │   ├── preprocessor_test.cpp
│   │   │   ├── process_state.h
│   │   │   ├── protobuf.cpp
│   │   │   ├── protobuf.h
│   │   │   ├── range.cpp
│   │   │   ├── range.h
│   │   │   ├── range.proto
│   │   │   ├── registry_error.cpp
│   │   │   ├── registry_error.h
│   │   │   ├── scalar.cpp
│   │   │   ├── scalar.h
│   │   │   ├── sequential.proto
│   │   │   ├── shape.cpp
│   │   │   ├── shape.h
│   │   │   ├── shape.proto
│   │   │   ├── shape_test.cpp
│   │   │   ├── shape_vec.h
│   │   │   ├── shape_view.cpp
│   │   │   ├── shape_view.h
│   │   │   ├── shared_or_scalar.h
│   │   │   ├── single_thread_obj_pool.h
│   │   │   ├── single_thread_obj_pool_test.cpp
│   │   │   ├── singleton.h
│   │   │   ├── sized_buffer_view.h
│   │   │   ├── small_vector.h
│   │   │   ├── spin_counter.cpp
│   │   │   ├── spin_counter.h
│   │   │   ├── static_check.h
│   │   │   ├── static_global.h
│   │   │   ├── steady_vector.h
│   │   │   ├── steady_vector_test.cpp
│   │   │   ├── str_util.cpp
│   │   │   ├── str_util.h
│   │   │   ├── stream_type.h
│   │   │   ├── stride.cpp
│   │   │   ├── stride.h
│   │   │   ├── switch_func.h
│   │   │   ├── symbol.h
│   │   │   ├── symbol_test.cpp
│   │   │   ├── tensor_buffer.cpp
│   │   │   ├── tensor_buffer.h
│   │   │   ├── tensor_desc.cpp
│   │   │   ├── tensor_desc.h
│   │   │   ├── tensor_meta.cpp
│   │   │   ├── tensor_meta.h
│   │   │   ├── test_util.h
│   │   │   ├── thread_local_guard.h
│   │   │   ├── thread_local_guard_test.cpp
│   │   │   ├── throw.h
│   │   │   ├── to_string.h
│   │   │   ├── tuple_hash.h
│   │   │   ├── type_traits.h
│   │   │   ├── util.cpp
│   │   │   ├── util.h
│   │   │   ├── wrap_dim_utils.h
│   │   │   └── zero_only_zip.h
│   │   ├── control/
│   │   │   ├── bootstrap_client.h
│   │   │   ├── bootstrap_server.h
│   │   │   ├── control.proto
│   │   │   ├── ctrl_bootstrap.cpp
│   │   │   ├── ctrl_bootstrap.h
│   │   │   ├── ctrl_bootstrap.proto
│   │   │   ├── ctrl_call.h
│   │   │   ├── ctrl_client.cpp
│   │   │   ├── ctrl_client.h
│   │   │   ├── ctrl_server.cpp
│   │   │   ├── ctrl_server.h
│   │   │   ├── ctrl_service.cpp
│   │   │   ├── ctrl_service.h
│   │   │   ├── ctrl_test.cpp
│   │   │   ├── ctrl_util.cpp
│   │   │   ├── ctrl_util.h
│   │   │   ├── global_process_ctx.h
│   │   │   ├── host_list_bootstrap_client.cpp
│   │   │   ├── host_list_bootstrap_client.h
│   │   │   ├── host_list_bootstrap_server.cpp
│   │   │   ├── host_list_bootstrap_server.h
│   │   │   ├── rank_info_bootstrap_client.cpp
│   │   │   ├── rank_info_bootstrap_client.h
│   │   │   ├── rank_info_bootstrap_server.cpp
│   │   │   ├── rank_info_bootstrap_server.h
│   │   │   ├── rpc_client.cpp
│   │   │   ├── rpc_client.h
│   │   │   ├── rpc_server.cpp
│   │   │   ├── rpc_server.h
│   │   │   └── worker_process_info.proto
│   │   ├── cuda/
│   │   │   ├── atomic.cuh
│   │   │   ├── elementwise.cuh
│   │   │   ├── layer_norm.cuh
│   │   │   ├── rms_norm.cuh
│   │   │   ├── softmax.cuh
│   │   │   └── unique.cuh
│   │   ├── device/
│   │   │   ├── cuda_pseudo_bfloat16.h
│   │   │   ├── cuda_pseudo_half.h
│   │   │   ├── cuda_util.cpp
│   │   │   ├── cuda_util.h
│   │   │   ├── cudnn_conv_util.cpp
│   │   │   ├── cudnn_conv_util.h
│   │   │   ├── cudnn_util.cpp
│   │   │   ├── cudnn_util.h
│   │   │   ├── device_id.cpp
│   │   │   ├── device_id.h
│   │   │   ├── ep_based_event_record.h
│   │   │   ├── event_record.h
│   │   │   ├── nccl_util.cpp
│   │   │   └── nccl_util.h
│   │   ├── eager/
│   │   │   ├── call_context.cpp
│   │   │   ├── call_context.h
│   │   │   ├── dev_vm_dep_object_consume_mode.h
│   │   │   ├── eager_blob_object.cpp
│   │   │   ├── eager_blob_object.h
│   │   │   ├── local_dep_object.cpp
│   │   │   ├── local_dep_object.h
│   │   │   ├── tensor_storage.cpp
│   │   │   └── tensor_storage.h
│   │   ├── embedding/
│   │   │   ├── cache.cpp
│   │   │   ├── cache.h
│   │   │   ├── cache_test.cpp
│   │   │   ├── cached_key_value_store.cu
│   │   │   ├── cached_key_value_store.h
│   │   │   ├── embedding_manager.cpp
│   │   │   ├── embedding_manager.h
│   │   │   ├── full_cache.cu
│   │   │   ├── full_cache.h
│   │   │   ├── hash_functions.cuh
│   │   │   ├── key_value_store.h
│   │   │   ├── key_value_store_options.h
│   │   │   ├── key_value_store_test.cpp
│   │   │   ├── kv_iterator.h
│   │   │   ├── lru_cache.cu
│   │   │   ├── lru_cache.h
│   │   │   ├── mock_key_value_store.cu
│   │   │   ├── mock_key_value_store.h
│   │   │   ├── persistent_table.cpp
│   │   │   ├── persistent_table.h
│   │   │   ├── persistent_table_key_value_store.cu
│   │   │   ├── persistent_table_key_value_store.h
│   │   │   └── posix_file.h
│   │   ├── ep/
│   │   │   ├── common/
│   │   │   │   ├── active_device_guard.cpp
│   │   │   │   ├── device.cpp
│   │   │   │   ├── device_manager_registry.cpp
│   │   │   │   ├── onednn.h
│   │   │   │   └── primitive/
│   │   │   │       ├── add.cpp
│   │   │   │       ├── batch_matmul.cpp
│   │   │   │       ├── binary_functor.h
│   │   │   │       ├── broadcast_elementwise_binary.h
│   │   │   │       ├── broadcast_elementwise_unary.h
│   │   │   │       ├── broadcast_matmul.h
│   │   │   │       ├── broadcast_simplify_dims_test.cpp
│   │   │   │       ├── constant_pad.h
│   │   │   │       ├── copy_nd.h
│   │   │   │       ├── elementwise_unary.h
│   │   │   │       ├── matmul.cpp
│   │   │   │       ├── permute.h
│   │   │   │       ├── permute_impl.h
│   │   │   │       ├── permute_test.cpp
│   │   │   │       ├── unary_functor.h
│   │   │   │       ├── util.h
│   │   │   │       └── where.h
│   │   │   ├── cpu/
│   │   │   │   ├── cpu_device.cpp
│   │   │   │   ├── cpu_device.h
│   │   │   │   ├── cpu_device_manager.cpp
│   │   │   │   ├── cpu_device_manager.h
│   │   │   │   ├── cpu_device_manager_factory.cpp
│   │   │   │   ├── cpu_event.cpp
│   │   │   │   ├── cpu_event.h
│   │   │   │   ├── cpu_random_generator.cpp
│   │   │   │   ├── cpu_random_generator.h
│   │   │   │   ├── cpu_stream.cpp
│   │   │   │   ├── cpu_stream.h
│   │   │   │   └── primitive/
│   │   │   │       ├── add.cpp
│   │   │   │       ├── binary_functor.h
│   │   │   │       ├── broadcast_elementwise_binary.cpp
│   │   │   │       ├── broadcast_elementwise_unary.cpp
│   │   │   │       ├── broadcast_matmul.cpp
│   │   │   │       ├── cast.cpp
│   │   │   │       ├── constant_pad.cpp
│   │   │   │       ├── copy_nd.cpp
│   │   │   │       ├── elementwise_unary.cpp
│   │   │   │       ├── fill.cpp
│   │   │   │       ├── memcpy.cpp
│   │   │   │       ├── memset.cpp
│   │   │   │       ├── permute.cpp
│   │   │   │       ├── softmax.cpp
│   │   │   │       ├── softmax_backward.cpp
│   │   │   │       ├── tensor_fill.cpp
│   │   │   │       ├── type_seq.h
│   │   │   │       ├── unary_functor.h
│   │   │   │       └── where.cpp
│   │   │   ├── cuda/
│   │   │   │   ├── cuda_device.cpp
│   │   │   │   ├── cuda_device.h
│   │   │   │   ├── cuda_device_manager.cpp
│   │   │   │   ├── cuda_device_manager.h
│   │   │   │   ├── cuda_device_manager_factory.cpp
│   │   │   │   ├── cuda_event.cpp
│   │   │   │   ├── cuda_event.h
│   │   │   │   ├── cuda_matmul_mode.cpp
│   │   │   │   ├── cuda_matmul_mode.h
│   │   │   │   ├── cuda_random_generator.cpp
│   │   │   │   ├── cuda_random_generator.h
│   │   │   │   ├── cuda_stream.cpp
│   │   │   │   ├── cuda_stream.h
│   │   │   │   └── primitive/
│   │   │   │       ├── add.cu
│   │   │   │       ├── binary_functor.cuh
│   │   │   │       ├── broadcast_elementwise_binary.cu
│   │   │   │       ├── broadcast_elementwise_binary.cuh
│   │   │   │       ├── broadcast_elementwise_binary_activation_grad_0.cu
│   │   │   │       ├── broadcast_elementwise_binary_activation_grad_1.cu
│   │   │   │       ├── broadcast_elementwise_binary_activation_grad_2.cu
│   │   │   │       ├── broadcast_elementwise_binary_bitwise.cu
│   │   │   │       ├── broadcast_elementwise_binary_comparision_0.cu
│   │   │   │       ├── broadcast_elementwise_binary_comparision_1.cu
│   │   │   │       ├── broadcast_elementwise_binary_comparision_complex.cu
│   │   │   │       ├── broadcast_elementwise_binary_logical.cu
│   │   │   │       ├── broadcast_elementwise_binary_math_0.cu
│   │   │   │       ├── broadcast_elementwise_binary_math_1.cu
│   │   │   │       ├── broadcast_elementwise_binary_math_2.cu
│   │   │   │       ├── broadcast_elementwise_binary_math_complex.cu
│   │   │   │       ├── broadcast_elementwise_unary.cu
│   │   │   │       ├── broadcast_matmul.cpp
│   │   │   │       ├── cast.cu
│   │   │   │       ├── constant_pad.cu
│   │   │   │       ├── copy_nd.cu
│   │   │   │       ├── elementwise_unary.cu
│   │   │   │       ├── fill.cu
│   │   │   │       ├── math_elementwise_unary_math_grad_0.cu
│   │   │   │       ├── math_elementwise_unary_math_grad_1.cu
│   │   │   │       ├── math_elementwise_unary_math_grad_2.cu
│   │   │   │       ├── math_elementwise_unary_math_grad_3.cu
│   │   │   │       ├── math_elementwise_unary_math_grad_complex.cu
│   │   │   │       ├── memcpy.cpp
│   │   │   │       ├── memset.cpp
│   │   │   │       ├── permute.cu
│   │   │   │       ├── softmax.cu
│   │   │   │       ├── softmax_backward.cu
│   │   │   │       ├── tensor_fill.cu
│   │   │   │       ├── type_seq.h
│   │   │   │       ├── unary_functor.cuh
│   │   │   │       └── where.cu
│   │   │   ├── include/
│   │   │   │   ├── active_device_guard.h
│   │   │   │   ├── allocation_options.h
│   │   │   │   ├── device.h
│   │   │   │   ├── device_manager.h
│   │   │   │   ├── device_manager_factory.h
│   │   │   │   ├── device_manager_registry.h
│   │   │   │   ├── event.h
│   │   │   │   ├── primitive/
│   │   │   │   │   ├── add.h
│   │   │   │   │   ├── batch_matmul.h
│   │   │   │   │   ├── binary_op.h
│   │   │   │   │   ├── blas.h
│   │   │   │   │   ├── broadcast_elementwise_binary.h
│   │   │   │   │   ├── broadcast_elementwise_unary.h
│   │   │   │   │   ├── broadcast_matmul.h
│   │   │   │   │   ├── cast.h
│   │   │   │   │   ├── constant_pad.h
│   │   │   │   │   ├── copy_nd.h
│   │   │   │   │   ├── elementwise_unary.h
│   │   │   │   │   ├── fast_integer_math.h
│   │   │   │   │   ├── fill.h
│   │   │   │   │   ├── log_softmax.h
│   │   │   │   │   ├── log_softmax_backward.h
│   │   │   │   │   ├── matmul.h
│   │   │   │   │   ├── memcpy.h
│   │   │   │   │   ├── memset.h
│   │   │   │   │   ├── one_hot.h
│   │   │   │   │   ├── permute.h
│   │   │   │   │   ├── primitive.h
│   │   │   │   │   ├── softmax.h
│   │   │   │   │   ├── softmax_backward.h
│   │   │   │   │   ├── tensor_fill.h
│   │   │   │   │   ├── unary_op.h
│   │   │   │   │   └── where.h
│   │   │   │   ├── random_generator.h
│   │   │   │   └── stream.h
│   │   │   └── test/
│   │   │       ├── primitive/
│   │   │       │   ├── add_test.cpp
│   │   │       │   ├── batch_matmul_test.cpp
│   │   │       │   ├── binary_test.cpp
│   │   │       │   ├── broadcast_matmul_test.cpp
│   │   │       │   ├── cast_test.cpp
│   │   │       │   ├── constant_pad_test.cpp
│   │   │       │   ├── copy_nd_test.cpp
│   │   │       │   ├── elementwise_unary_test.cpp
│   │   │       │   ├── fill_test.cpp
│   │   │       │   ├── matmul_test.cpp
│   │   │       │   ├── memcpy_test.cpp
│   │   │       │   ├── memset_test.cpp
│   │   │       │   ├── permute_test.cpp
│   │   │       │   ├── primitive_test.h
│   │   │       │   ├── softmax_backward_test.cpp
│   │   │       │   ├── softmax_test.cpp
│   │   │       │   ├── unary_test.cpp
│   │   │       │   └── where_test.cpp
│   │   │       └── test_util.h
│   │   ├── framework/
│   │   │   ├── arg_tuple.cpp
│   │   │   ├── arg_tuple.h
│   │   │   ├── attr_map.cpp
│   │   │   ├── attr_map.h
│   │   │   ├── attr_map_test.cpp
│   │   │   ├── attr_value.cpp
│   │   │   ├── attr_value.h
│   │   │   ├── attr_value_accessor.cpp
│   │   │   ├── attr_value_accessor.h
│   │   │   ├── auto_random_generator.cpp
│   │   │   ├── auto_random_generator.h
│   │   │   ├── autocast.cpp
│   │   │   ├── autocast.h
│   │   │   ├── compute_complexity_fn_context.h
│   │   │   ├── config_def.cpp
│   │   │   ├── config_def.h
│   │   │   ├── config_def.proto
│   │   │   ├── consistency_check.cpp
│   │   │   ├── consistency_check.h
│   │   │   ├── device.cpp
│   │   │   ├── device.h
│   │   │   ├── dtype.cpp
│   │   │   ├── dtype.h
│   │   │   ├── eager_util.h
│   │   │   ├── framework.h
│   │   │   ├── get_nd_sbp_signature_list_context.h
│   │   │   ├── global_param_grad_sync_mode.cpp
│   │   │   ├── global_param_grad_sync_mode.h
│   │   │   ├── global_tensor_infer_cache.cpp
│   │   │   ├── global_tensor_infer_cache.h
│   │   │   ├── id_util.cpp
│   │   │   ├── id_util.h
│   │   │   ├── infer_nd_sbp_fn_context.h
│   │   │   ├── infer_output_blob_time_shape_fn_context.h
│   │   │   ├── infer_util.cpp
│   │   │   ├── infer_util.h
│   │   │   ├── instructions_builder.cpp
│   │   │   ├── instructions_builder.h
│   │   │   ├── layout.cpp
│   │   │   ├── layout.h
│   │   │   ├── load_library.cpp
│   │   │   ├── load_library.h
│   │   │   ├── local_tensor_infer_cache.cpp
│   │   │   ├── local_tensor_infer_cache.h
│   │   │   ├── multi_client_session_context.cpp
│   │   │   ├── multi_client_session_context.h
│   │   │   ├── multi_thread.cpp
│   │   │   ├── multi_thread.h
│   │   │   ├── mutable_attr_map.h
│   │   │   ├── nd_sbp.cpp
│   │   │   ├── nd_sbp.h
│   │   │   ├── nn_graph.cpp
│   │   │   ├── nn_graph.h
│   │   │   ├── nn_graph_if.h
│   │   │   ├── op_builder.cpp
│   │   │   ├── op_builder.h
│   │   │   ├── op_definition.h
│   │   │   ├── op_expr.cpp
│   │   │   ├── op_expr.h
│   │   │   ├── op_expr_grad_function.cpp
│   │   │   ├── op_expr_grad_function.h
│   │   │   ├── op_interpreter/
│   │   │   │   ├── dispatch_frame.cpp
│   │   │   │   ├── dispatch_frame.h
│   │   │   │   ├── eager_global_op_interpreter.cpp
│   │   │   │   ├── eager_local_op_interpreter.cpp
│   │   │   │   ├── eager_local_op_interpreter.h
│   │   │   │   ├── lazy_op_interpreter.cpp
│   │   │   │   ├── lazy_op_interpreter.h
│   │   │   │   ├── op_interpreter.cpp
│   │   │   │   ├── op_interpreter_util.cpp
│   │   │   │   └── op_interpreter_util.h
│   │   │   ├── op_interpreter.h
│   │   │   ├── op_kernel.cpp
│   │   │   ├── op_kernel.h
│   │   │   ├── op_kernel_infer_cache.cpp
│   │   │   ├── op_kernel_infer_cache.h
│   │   │   ├── ordered_string_list.h
│   │   │   ├── parallel_conf_util.cpp
│   │   │   ├── parallel_conf_util.h
│   │   │   ├── parallel_conf_util_test.cpp
│   │   │   ├── placed_nd_sbp.cpp
│   │   │   ├── placed_nd_sbp.h
│   │   │   ├── placement_sbp_util.cpp
│   │   │   ├── placement_sbp_util.h
│   │   │   ├── placement_sbp_util_test.cpp
│   │   │   ├── placement_utils.cpp
│   │   │   ├── placement_utils.h
│   │   │   ├── random_generator.cpp
│   │   │   ├── random_generator.h
│   │   │   ├── rank_group_rpc_util.cpp
│   │   │   ├── rank_group_rpc_util.h
│   │   │   ├── saved_tensor_hooks.h
│   │   │   ├── sbp_context.cpp
│   │   │   ├── sbp_context.h
│   │   │   ├── sbp_infer_util.cpp
│   │   │   ├── sbp_infer_util.h
│   │   │   ├── sbp_infer_util_test.cpp
│   │   │   ├── scope_util.cpp
│   │   │   ├── scope_util.h
│   │   │   ├── session_util.cpp
│   │   │   ├── session_util.h
│   │   │   ├── shut_down_util.cpp
│   │   │   ├── shut_down_util.h
│   │   │   ├── stream.cpp
│   │   │   ├── stream.h
│   │   │   ├── stream_allocator_is_pinned.h
│   │   │   ├── stream_get_stream_type_name.h
│   │   │   ├── stream_guard.cpp
│   │   │   ├── stream_guard.h
│   │   │   ├── stream_is_comm_net_stream.h
│   │   │   ├── stream_mgr.cpp
│   │   │   ├── stream_mgr.h
│   │   │   ├── stream_need_soft_sync.h
│   │   │   ├── stream_on_independent_thread.h
│   │   │   ├── stream_set.cpp
│   │   │   ├── stream_set.h
│   │   │   ├── stream_support_stream_wait.h
│   │   │   ├── symbol_storage_util.cpp
│   │   │   ├── symbol_storage_util.h
│   │   │   ├── sync_symbol_global_tensor_meta.cpp
│   │   │   ├── sync_symbol_global_tensor_meta.h
│   │   │   ├── sync_symbol_nd_sbp.cpp
│   │   │   ├── sync_symbol_nd_sbp.h
│   │   │   ├── sync_symbol_parallel_desc.cpp
│   │   │   ├── sync_symbol_parallel_desc.h
│   │   │   ├── synced_symbol_map.cpp
│   │   │   ├── synced_symbol_map.h
│   │   │   ├── tensor.cpp
│   │   │   ├── tensor.h
│   │   │   ├── tensor_arg.cpp
│   │   │   ├── tensor_arg.h
│   │   │   ├── tensor_global_id.cpp
│   │   │   ├── tensor_global_id.h
│   │   │   ├── tensor_impl.cpp
│   │   │   ├── tensor_impl.h
│   │   │   ├── tensor_methods.cpp
│   │   │   ├── tensor_methods.h
│   │   │   ├── tensor_name_scope.cpp
│   │   │   ├── tensor_name_scope.h
│   │   │   ├── tensor_rpc_util.cpp
│   │   │   ├── tensor_rpc_util.h
│   │   │   ├── tensor_storage.cpp
│   │   │   ├── tensor_storage.h
│   │   │   ├── tensor_tuple.cpp
│   │   │   ├── tensor_tuple.h
│   │   │   ├── tensor_util.cpp
│   │   │   ├── tensor_util.h
│   │   │   ├── to_string.cpp
│   │   │   ├── to_string.h
│   │   │   ├── transport_token.cpp
│   │   │   ├── transport_token.h
│   │   │   ├── transport_util.cpp
│   │   │   ├── transport_util.h
│   │   │   ├── user_op_attr.proto
│   │   │   ├── user_op_conf.cpp
│   │   │   ├── user_op_conf.h
│   │   │   ├── user_op_conf.proto
│   │   │   ├── user_op_def.cpp
│   │   │   ├── user_op_def.h
│   │   │   ├── user_op_def.proto
│   │   │   ├── user_op_hob.h
│   │   │   ├── user_op_kernel_registry.cpp
│   │   │   ├── user_op_kernel_registry.h
│   │   │   ├── user_op_registry.cpp
│   │   │   ├── user_op_registry.h
│   │   │   ├── user_op_registry_manager.cpp
│   │   │   ├── user_op_registry_manager.h
│   │   │   ├── user_op_tensor.h
│   │   │   ├── util.h
│   │   │   ├── variable_meta_info.proto
│   │   │   ├── variable_tensor_mgr.cpp
│   │   │   └── variable_tensor_mgr.h
│   │   ├── functional/
│   │   │   ├── function_library.h
│   │   │   ├── functional.h
│   │   │   ├── functional_api.yaml
│   │   │   ├── impl/
│   │   │   │   ├── activation_functor.cpp
│   │   │   │   ├── array_functor.cpp
│   │   │   │   ├── binary_functor.cpp
│   │   │   │   ├── binary_functor.h
│   │   │   │   ├── binary_grad_functor.cpp
│   │   │   │   ├── comm_functor.cpp
│   │   │   │   ├── common.cpp
│   │   │   │   ├── common.h
│   │   │   │   ├── dataset_functor.cpp
│   │   │   │   ├── eye_functor.cpp
│   │   │   │   ├── fused_attention_functor.cpp
│   │   │   │   ├── global_cast.cpp
│   │   │   │   ├── gradient_accumulation_functor.cpp
│   │   │   │   ├── higher_derivative_functor.cpp
│   │   │   │   ├── linalg_functor.cpp
│   │   │   │   ├── math_functor.cpp
│   │   │   │   ├── nn_functor.cpp
│   │   │   │   ├── nn_grad_functor.cpp
│   │   │   │   ├── quantization.cpp
│   │   │   │   ├── random_functor.cpp
│   │   │   │   ├── rnn_functor.cpp
│   │   │   │   ├── slice_boxing_functor.cpp
│   │   │   │   ├── test_functor.cpp
│   │   │   │   ├── unary_functor.cpp
│   │   │   │   ├── unary_functor.h
│   │   │   │   └── util_ops_functor.cpp
│   │   │   ├── packed_functor.h
│   │   │   ├── sequence_function.h
│   │   │   ├── tensor_index.cpp
│   │   │   ├── tensor_index.h
│   │   │   ├── tensor_processor.cpp
│   │   │   └── tensor_processor.h
│   │   ├── graph/
│   │   │   ├── boxing/
│   │   │   │   ├── b21_sub_task_graph_builder.cpp
│   │   │   │   ├── b21_sub_task_graph_builder.h
│   │   │   │   ├── boxing_logger.cpp
│   │   │   │   ├── boxing_logger.h
│   │   │   │   ├── ccl_sub_task_graph_builder.cpp
│   │   │   │   ├── ccl_sub_task_graph_builder.h
│   │   │   │   ├── chain_sub_task_graph_builder.cpp
│   │   │   │   ├── chain_sub_task_graph_builder.h
│   │   │   │   ├── collective_boxing.proto
│   │   │   │   ├── collective_boxing_sub_task_graph_builder.cpp
│   │   │   │   ├── collective_boxing_sub_task_graph_builder.h
│   │   │   │   ├── collective_boxing_util.cpp
│   │   │   │   ├── collective_boxing_util.h
│   │   │   │   ├── fallback_to_cpu_slice_boxing_sub_task_graph_builder.cpp
│   │   │   │   ├── fallback_to_cpu_slice_boxing_sub_task_graph_builder.h
│   │   │   │   ├── hierarchical_sub_task_graph_builder.h
│   │   │   │   ├── hierarchical_sub_task_graph_builder_impl.cpp
│   │   │   │   ├── hierarchical_sub_task_graph_builder_impl.h
│   │   │   │   ├── hierarchical_sub_task_graph_builder_util.cpp
│   │   │   │   ├── hierarchical_sub_task_graph_builder_util.h
│   │   │   │   ├── naive_b2b_sub_task_graph_builder.cpp
│   │   │   │   ├── naive_b2b_sub_task_graph_builder.h
│   │   │   │   ├── naive_b2p_sub_task_graph_builder.cpp
│   │   │   │   ├── naive_b2p_sub_task_graph_builder.h
│   │   │   │   ├── one_to_one_sub_task_graph_builder.cpp
│   │   │   │   ├── one_to_one_sub_task_graph_builder.h
│   │   │   │   ├── slice_boxing_sub_task_graph_builder.cpp
│   │   │   │   ├── slice_boxing_sub_task_graph_builder.h
│   │   │   │   ├── sub_task_graph_builder.h
│   │   │   │   ├── sub_task_graph_builder_context.cpp
│   │   │   │   ├── sub_task_graph_builder_context.h
│   │   │   │   ├── sub_task_graph_builder_status_util.cpp
│   │   │   │   ├── sub_task_graph_builder_status_util.h
│   │   │   │   ├── sub_task_graph_builder_util.cpp
│   │   │   │   └── sub_task_graph_builder_util.h
│   │   │   ├── boxing_identity_task_node.cpp
│   │   │   ├── boxing_identity_task_node.h
│   │   │   ├── boxing_task_graph.proto
│   │   │   ├── boxing_zeros_task_node.cpp
│   │   │   ├── boxing_zeros_task_node.h
│   │   │   ├── collective_boxing_pack_task_node.cpp
│   │   │   ├── collective_boxing_pack_task_node.h
│   │   │   ├── collective_boxing_task_node.cpp
│   │   │   ├── collective_boxing_task_node.h
│   │   │   ├── collective_boxing_unpack_task_node.cpp
│   │   │   ├── collective_boxing_unpack_task_node.h
│   │   │   ├── compute_task_node.cpp
│   │   │   ├── compute_task_node.h
│   │   │   ├── copy_task_node.cpp
│   │   │   ├── copy_task_node.h
│   │   │   ├── exec_graph.cpp
│   │   │   ├── exec_graph.h
│   │   │   ├── exec_sequence.proto
│   │   │   ├── fake_consumed_regst_provider.h
│   │   │   ├── graph.h
│   │   │   ├── inplace_lbi_graph.cpp
│   │   │   ├── inplace_lbi_graph.h
│   │   │   ├── inplace_regst_graph.cpp
│   │   │   ├── inplace_regst_graph.h
│   │   │   ├── nccl_send_recv_boxing_task_node.cpp
│   │   │   ├── nccl_send_recv_boxing_task_node.h
│   │   │   ├── node.cpp
│   │   │   ├── node.h
│   │   │   ├── normal_forward_compute_task_node.h
│   │   │   ├── op_graph.cpp
│   │   │   ├── op_graph.h
│   │   │   ├── plan_task_graph.cpp
│   │   │   ├── plan_task_graph.h
│   │   │   ├── slice_boxing_task_node.cpp
│   │   │   ├── slice_boxing_task_node.h
│   │   │   ├── straighten_nodes.cpp
│   │   │   ├── straighten_nodes.h
│   │   │   ├── stream_id.cpp
│   │   │   ├── stream_id.h
│   │   │   ├── stream_index_generator.cpp
│   │   │   ├── stream_index_generator.h
│   │   │   ├── task_edge.proto
│   │   │   ├── task_graph.cpp
│   │   │   ├── task_graph.h
│   │   │   ├── task_graph_rebuild_ctx.cpp
│   │   │   ├── task_graph_rebuild_ctx.h
│   │   │   ├── task_id.cpp
│   │   │   ├── task_id.h
│   │   │   ├── task_id_generator.cpp
│   │   │   ├── task_id_generator.h
│   │   │   ├── task_node.cpp
│   │   │   ├── task_node.h
│   │   │   ├── task_stream_id.h
│   │   │   ├── task_stream_index_manager.cpp
│   │   │   ├── task_stream_index_manager.h
│   │   │   ├── task_type_visitor.h
│   │   │   ├── transport_task_node.cpp
│   │   │   └── transport_task_node.h
│   │   ├── graph_impl/
│   │   │   ├── acc_compute_task_node.cpp
│   │   │   ├── acc_ctrl_tick_compute_task_node.cpp
│   │   │   ├── acc_tick_compute_task_node.cpp
│   │   │   ├── callback_notify_compute_task_node.cpp
│   │   │   ├── case_compute_task_node.cpp
│   │   │   ├── critical_section_wait_compute_task_node.cpp
│   │   │   ├── decode_h2d_compute_task_node.cpp
│   │   │   ├── device_tick_compute_task_node.cpp
│   │   │   ├── distribute_concat_compute_task_node.cpp
│   │   │   ├── distribute_split_compute_task_node.cpp
│   │   │   ├── dst_subset_tick_compute_task_node.cpp
│   │   │   ├── esac_compute_task_node.cpp
│   │   │   ├── normal_forward_compute_task_node.cpp
│   │   │   ├── pack_compute_task_node.cpp
│   │   │   ├── reentrant_lock_compute_task_node.cpp
│   │   │   ├── repeat_compute_task_node.cpp
│   │   │   ├── source_tick_compute_task_node.cpp
│   │   │   ├── src_subset_tick_compute_task_node.cpp
│   │   │   ├── ssp_variable_proxy_task_node.cpp
│   │   │   ├── tick_compute_task_node.cpp
│   │   │   ├── unpack_compute_task_node.cpp
│   │   │   └── wait_and_send_ids_compute_task_node.cpp
│   │   ├── hardware/
│   │   │   ├── basic_device_descriptor_list.cpp
│   │   │   ├── basic_device_descriptor_list.h
│   │   │   ├── cuda_device_descriptor.cpp
│   │   │   ├── cuda_device_descriptor.h
│   │   │   ├── cuda_device_descriptor_class.cpp
│   │   │   ├── device_descriptor.h
│   │   │   ├── device_descriptor_class.cpp
│   │   │   ├── device_descriptor_class.h
│   │   │   ├── device_descriptor_list.h
│   │   │   ├── net_ib_device_descriptor.cpp
│   │   │   ├── net_ib_device_descriptor.h
│   │   │   ├── net_ib_device_descriptor_class.cpp
│   │   │   ├── net_socket_device_descriptor.cpp
│   │   │   ├── net_socket_device_descriptor.h
│   │   │   ├── net_socket_device_descriptor_class.cpp
│   │   │   ├── node_device_descriptor.cpp
│   │   │   ├── node_device_descriptor.h
│   │   │   ├── node_device_descriptor_manager.cpp
│   │   │   ├── node_device_descriptor_manager.h
│   │   │   ├── topology_descriptor.cpp
│   │   │   └── topology_descriptor.h
│   │   ├── intrusive/
│   │   │   ├── README.md
│   │   │   ├── base.h
│   │   │   ├── cpp_attribute.h
│   │   │   ├── dss.h
│   │   │   ├── dss_test.cpp
│   │   │   ├── flat_msg.h
│   │   │   ├── flat_msg_test.cpp
│   │   │   ├── flat_msg_view.h
│   │   │   ├── flat_msg_view_test.cpp
│   │   │   ├── for_each.h
│   │   │   ├── force_standard_layout.h
│   │   │   ├── force_standard_layout_test.cpp
│   │   │   ├── head_free_list.h
│   │   │   ├── head_free_list_test.cpp
│   │   │   ├── intrusive.h
│   │   │   ├── intrusive_core_test.cpp
│   │   │   ├── list.h
│   │   │   ├── list_hook.h
│   │   │   ├── list_hook_test.cpp
│   │   │   ├── list_test.cpp
│   │   │   ├── mutexed_list.h
│   │   │   ├── object_pool.h
│   │   │   ├── object_pool_test.cpp
│   │   │   ├── ref.h
│   │   │   ├── reflective.h
│   │   │   ├── shared_ptr.h
│   │   │   ├── skiplist.h
│   │   │   ├── skiplist_hook.h
│   │   │   ├── skiplist_hook_test.cpp
│   │   │   ├── skiplist_test.cpp
│   │   │   ├── static_counter.h
│   │   │   ├── static_counter_test.cpp
│   │   │   ├── struct_traits.h
│   │   │   └── struct_traits_test.cpp
│   │   ├── ipc/
│   │   │   ├── shared_memory.cpp
│   │   │   └── shared_memory.h
│   │   ├── job/
│   │   │   ├── blob_lifetime_signature.proto
│   │   │   ├── checkpointing_config_def.cpp
│   │   │   ├── cluster_instruction.cpp
│   │   │   ├── cluster_instruction.h
│   │   │   ├── cluster_instruction.proto
│   │   │   ├── collective_boxing/
│   │   │   │   ├── coordinator.h
│   │   │   │   ├── executor.cpp
│   │   │   │   ├── executor.h
│   │   │   │   ├── executor_backend.h
│   │   │   │   ├── executor_backend_manager.cpp
│   │   │   │   ├── executor_backend_manager.h
│   │   │   │   ├── nccl_executor_backend.cu
│   │   │   │   ├── request_store.cpp
│   │   │   │   ├── request_store.h
│   │   │   │   ├── runtime_request_info.h
│   │   │   │   ├── scheduler.cpp
│   │   │   │   ├── scheduler.h
│   │   │   │   ├── static_group_coordinator.cpp
│   │   │   │   └── static_group_coordinator.h
│   │   │   ├── compile_mode.cpp
│   │   │   ├── compile_mode.h
│   │   │   ├── compiler.cpp
│   │   │   ├── compiler.h
│   │   │   ├── critical_section.proto
│   │   │   ├── critical_section_desc.cpp
│   │   │   ├── critical_section_desc.h
│   │   │   ├── critical_section_instance.h
│   │   │   ├── distribute_hirarchy.proto
│   │   │   ├── dlnet_conf.proto
│   │   │   ├── eager_ccl_comm_manager.cpp
│   │   │   ├── eager_ccl_comm_manager.h
│   │   │   ├── eager_nccl_comm_manager.cpp
│   │   │   ├── eager_nccl_comm_manager.h
│   │   │   ├── env.proto
│   │   │   ├── env_desc.cpp
│   │   │   ├── env_desc.h
│   │   │   ├── env_global_objects_scope.cpp
│   │   │   ├── env_global_objects_scope.h
│   │   │   ├── function_config_def.cpp
│   │   │   ├── global_for.cpp
│   │   │   ├── global_for.h
│   │   │   ├── global_mode.cpp
│   │   │   ├── global_mode.h
│   │   │   ├── graph_scope_vars.cpp
│   │   │   ├── graph_scope_vars.h
│   │   │   ├── id_manager.cpp
│   │   │   ├── id_manager.h
│   │   │   ├── id_manager_test.cpp
│   │   │   ├── id_state.h
│   │   │   ├── initializer_conf.proto
│   │   │   ├── inter_job_mem_sharing_util.cpp
│   │   │   ├── inter_job_mem_sharing_util.h
│   │   │   ├── inter_user_job_info.proto
│   │   │   ├── intra_job_mem_sharing_util.cpp
│   │   │   ├── intra_job_mem_sharing_util.h
│   │   │   ├── job.proto
│   │   │   ├── job_build_and_infer_ctx.cpp
│   │   │   ├── job_build_and_infer_ctx.h
│   │   │   ├── job_build_and_infer_ctx_mgr.cpp
│   │   │   ├── job_build_and_infer_ctx_mgr.h
│   │   │   ├── job_builder.cpp
│   │   │   ├── job_builder.h
│   │   │   ├── job_conf.proto
│   │   │   ├── job_desc.cpp
│   │   │   ├── job_desc.h
│   │   │   ├── job_instance.h
│   │   │   ├── job_interpreter.cpp
│   │   │   ├── job_interpreter.h
│   │   │   ├── job_ir.cpp
│   │   │   ├── job_ir.h
│   │   │   ├── job_set.proto
│   │   │   ├── job_set_compile_ctx.h
│   │   │   ├── job_set_compile_ctx.proto
│   │   │   ├── lazy_mode.cpp
│   │   │   ├── lazy_mode.h
│   │   │   ├── learning_rate_schedule_conf.proto
│   │   │   ├── local_parallel.proto
│   │   │   ├── local_sig_infer_hint.h
│   │   │   ├── memory_share_strategy.cpp
│   │   │   ├── memory_share_strategy.h
│   │   │   ├── module_conf.proto
│   │   │   ├── nd_sbp_infer_hint.h
│   │   │   ├── nd_sbp_util.cpp
│   │   │   ├── nd_sbp_util.h
│   │   │   ├── oneflow.cpp
│   │   │   ├── oneflow.h
│   │   │   ├── parallel_conf_signature.proto
│   │   │   ├── parallel_desc.cpp
│   │   │   ├── parallel_desc.h
│   │   │   ├── parallel_desc_test.cpp
│   │   │   ├── parallel_signature.proto
│   │   │   ├── pipeline_config_def.cpp
│   │   │   ├── placement.proto
│   │   │   ├── placement_scope.cpp
│   │   │   ├── placement_scope.h
│   │   │   ├── plan.proto
│   │   │   ├── plan_util.cpp
│   │   │   ├── plan_util.h
│   │   │   ├── qat_config_def.cpp
│   │   │   ├── rank_compiler.cpp
│   │   │   ├── rank_compiler.h
│   │   │   ├── rank_group.cpp
│   │   │   ├── rank_group.h
│   │   │   ├── rank_group_scope.cpp
│   │   │   ├── rank_group_scope.h
│   │   │   ├── rank_group_test.cpp
│   │   │   ├── regularizer_conf.proto
│   │   │   ├── resource.proto
│   │   │   ├── resource_desc.cpp
│   │   │   ├── resource_desc.h
│   │   │   ├── runtime.cpp
│   │   │   ├── runtime.h
│   │   │   ├── runtime_buffer_managers_scope.cpp
│   │   │   ├── runtime_buffer_managers_scope.h
│   │   │   ├── runtime_buffers_scope.cpp
│   │   │   ├── runtime_buffers_scope.h
│   │   │   ├── runtime_context.cpp
│   │   │   ├── runtime_context.h
│   │   │   ├── runtime_job_descs.cpp
│   │   │   ├── runtime_job_descs.h
│   │   │   ├── sbp_infer_hint.h
│   │   │   ├── sbp_parallel.cpp
│   │   │   ├── sbp_parallel.h
│   │   │   ├── sbp_parallel.proto
│   │   │   ├── sbp_signature_builder.cpp
│   │   │   ├── sbp_signature_builder.h
│   │   │   ├── scope.cpp
│   │   │   ├── scope.h
│   │   │   ├── scope.proto
│   │   │   ├── session.cpp
│   │   │   ├── session.h
│   │   │   ├── ssp_config_def.cpp
│   │   │   ├── sub_plan.proto
│   │   │   ├── task.proto
│   │   │   ├── utils/
│   │   │   │   ├── progress_bar.cpp
│   │   │   │   └── progress_bar.h
│   │   │   ├── version.cpp
│   │   │   └── version.h
│   │   ├── job_rewriter/
│   │   │   ├── adadelta_optim.cpp
│   │   │   ├── adagrad_optm.cpp
│   │   │   ├── adam_optm.cpp
│   │   │   ├── add_ssp_variable_proxy.cpp
│   │   │   ├── auto_learning_rate.cpp
│   │   │   ├── auto_mixed_precision.cpp
│   │   │   ├── auto_mixed_precision.h
│   │   │   ├── auto_mixed_precision_lists.cpp
│   │   │   ├── auto_mixed_precision_lists.h
│   │   │   ├── auto_parallel.cpp
│   │   │   ├── auto_train_step.cpp
│   │   │   ├── autograd.cpp
│   │   │   ├── autograd.h
│   │   │   ├── autotick.cpp
│   │   │   ├── autotick.h
│   │   │   ├── boxing_with_middle_nodes.cpp
│   │   │   ├── boxing_with_middle_nodes.h
│   │   │   ├── calculation_pass.cpp
│   │   │   ├── calculation_pass.h
│   │   │   ├── checkpointing_pass.cpp
│   │   │   ├── clip_by_global_norm_job_pass_state.h
│   │   │   ├── clone_grad.cpp
│   │   │   ├── clone_grad.h
│   │   │   ├── cudnn_fused_normalization_add_relu_pass.cpp
│   │   │   ├── cutlass_conv_tuning_warmup_pass.cpp
│   │   │   ├── delay_variable_op_execution_pass.cpp
│   │   │   ├── device_tick_autotick.cpp
│   │   │   ├── do_parallel_cast_before_widening_type_cast_pass.cpp
│   │   │   ├── dump_blob_parallel_conf_pass.cpp
│   │   │   ├── dump_variable_info_pass.cpp
│   │   │   ├── dynamic_loss_scale_job_pass_state.h
│   │   │   ├── dynamic_loss_scale_schedule_pass.cpp
│   │   │   ├── eliminate_dead_nodes_pass.cpp
│   │   │   ├── fix_pipeline_stage_id_pass.cpp
│   │   │   ├── ftrl_optm.cpp
│   │   │   ├── fuse_add_to_output_pass.cpp
│   │   │   ├── fuse_bce_reduce_mean_fw_bw_pass.cpp
│   │   │   ├── fuse_cast_scale_pass.cpp
│   │   │   ├── fuse_consecutive_add_pass.cpp
│   │   │   ├── fuse_embedding_interaction_pass.cpp
│   │   │   ├── fuse_model_update_cast_pass.cpp
│   │   │   ├── fuse_update_ops_pass.cpp
│   │   │   ├── generate_optimizer_op_confs.cpp
│   │   │   ├── group_boxing_by_dst_parallel.cpp
│   │   │   ├── group_boxing_by_dst_parallel.h
│   │   │   ├── indexed_slices_optimizer_rewrite_pass.cpp
│   │   │   ├── input_autotick.cpp
│   │   │   ├── insert_nccl_logical_op_pass.cpp
│   │   │   ├── insert_pinned_identity_op_pass.cpp
│   │   │   ├── job_completer.cpp
│   │   │   ├── job_completer.h
│   │   │   ├── job_pass.cpp
│   │   │   ├── job_pass.h
│   │   │   ├── lamb_optm.cpp
│   │   │   ├── lars_optm.cpp
│   │   │   ├── logical_chain_pass.cpp
│   │   │   ├── momentum_optm.cpp
│   │   │   ├── multi_tensor_model_update.cpp
│   │   │   ├── nccl_logical_chain_strict_order_pass.cpp
│   │   │   ├── nccl_logical_op_fusion_pass.cpp
│   │   │   ├── normalization_exponential_average_auto_tick_rewrite_pass.cpp
│   │   │   ├── optimizer.cpp
│   │   │   ├── optimizer.h
│   │   │   ├── optimizer_placement_optimization_pass.cpp
│   │   │   ├── pass_util.cpp
│   │   │   ├── pass_util.h
│   │   │   ├── pipeline_buffer_pass.cpp
│   │   │   ├── prune_amp_white_identity_op_pass.cpp
│   │   │   ├── prune_cast_to_static_shape_op_pass.cpp
│   │   │   ├── prune_depend_op_pass.cpp
│   │   │   ├── prune_parallel_cast_op_pass.cpp
│   │   │   ├── prune_pinned_identity_op_pass.cpp
│   │   │   ├── quantization_aware_training.cpp
│   │   │   ├── replace_embedding_ops_pass.cpp
│   │   │   ├── rmsprop_optm.cpp
│   │   │   ├── sequential_one_embedding_shuffle_ops_pass.cpp
│   │   │   ├── sgd_optm.cpp
│   │   │   ├── source_user_op_auto_tick.cpp
│   │   │   ├── split_sparse_softmax_cross_entropy_op_pass.cpp
│   │   │   ├── system_op_fill_job_name_pass.cpp
│   │   │   ├── tick_autotick.cpp
│   │   │   └── variable_autotick.cpp
│   │   ├── kernel/
│   │   │   ├── assign_kernel.cpp
│   │   │   ├── blob_access_checker_kernel_observer.cpp
│   │   │   ├── blob_access_checker_kernel_observer.h
│   │   │   ├── blob_tensor_view.cpp
│   │   │   ├── blob_tensor_view.h
│   │   │   ├── boxing_kernel.cpp
│   │   │   ├── boxing_zeros_kernel.cpp
│   │   │   ├── broadcast_to_compatible_with_kernel.cpp
│   │   │   ├── callback_notify_kernel.cpp
│   │   │   ├── case_kernel.cpp
│   │   │   ├── case_kernel.h
│   │   │   ├── chain_kernel_observer.cpp
│   │   │   ├── chain_kernel_observer.h
│   │   │   ├── collective_boxing_kernels.cpp
│   │   │   ├── collective_boxing_pack_kernel.cpp
│   │   │   ├── collective_boxing_unpack_kernel.cpp
│   │   │   ├── constant_like_kernel.cpp
│   │   │   ├── cpu_check_numerics_kernel_observer.h
│   │   │   ├── cpu_numerics_kernel_observer.cpp
│   │   │   ├── critical_section_callback_tick_kernel.cpp
│   │   │   ├── critical_section_wait_tick_kernel.cpp
│   │   │   ├── cuda_check_numerics_kernel_observer.cu
│   │   │   ├── cuda_check_numerics_kernel_observer.h
│   │   │   ├── cuda_graph_support.h
│   │   │   ├── distribute_kernels.cpp
│   │   │   ├── dynamic_reshape_kernel.cpp
│   │   │   ├── dynamic_reshape_like_kernel.cpp
│   │   │   ├── esac_kernel.cpp
│   │   │   ├── esac_kernel.h
│   │   │   ├── identity_kernel.cpp
│   │   │   ├── image_decoder_random_crop_resize_kernel.cpp
│   │   │   ├── input_kernel.cpp
│   │   │   ├── kernel.cpp
│   │   │   ├── kernel.h
│   │   │   ├── kernel.proto
│   │   │   ├── kernel_context.h
│   │   │   ├── kernel_observer.h
│   │   │   ├── kernel_registration.cpp
│   │   │   ├── kernel_registration.h
│   │   │   ├── kernel_util.cpp
│   │   │   ├── kernel_util.cuh
│   │   │   ├── kernel_util.h
│   │   │   ├── learning_rate_schedule_kernel.cpp
│   │   │   ├── nccl_send_recv_boxing_kernel.cpp
│   │   │   ├── new_kernel_util.h
│   │   │   ├── nop_kernel.cpp
│   │   │   ├── output_kernel.cpp
│   │   │   ├── profiler_kernel_observer.cpp
│   │   │   ├── profiler_kernel_observer.h
│   │   │   ├── random_generator.cpp
│   │   │   ├── random_generator.cu
│   │   │   ├── random_generator.h
│   │   │   ├── reentrant_lock_kernel.cpp
│   │   │   ├── reentrant_lock_kernel.h
│   │   │   ├── return_kernel.cpp
│   │   │   ├── runtime_blob_shape_infer_helper.cpp
│   │   │   ├── runtime_blob_shape_infer_helper.h
│   │   │   ├── shape_elem_cnt_kernel.cpp
│   │   │   ├── slice_boxing_kernel.cpp
│   │   │   ├── sync_check_kernel_observer.cpp
│   │   │   ├── sync_check_kernel_observer.h
│   │   │   ├── sync_dynamic_resize_kernel.cpp
│   │   │   ├── total_loss_instance_num_kernel.cpp
│   │   │   ├── user_kernel.cpp
│   │   │   ├── user_kernel.h
│   │   │   ├── util/
│   │   │   │   ├── cuda_half_util.h
│   │   │   │   ├── numeric_limits.cuh
│   │   │   │   └── numerics.cuh
│   │   │   ├── wait_and_send_ids_kernel.cpp
│   │   │   └── wait_and_send_ids_kernel.h
│   │   ├── lazy/
│   │   │   ├── actor/
│   │   │   │   ├── acc_actor.cpp
│   │   │   │   ├── acc_ctrl_tick_actor.cpp
│   │   │   │   ├── acc_tick_actor.cpp
│   │   │   │   ├── actor.cpp
│   │   │   │   ├── actor.h
│   │   │   │   ├── actor_base.cpp
│   │   │   │   ├── actor_base.h
│   │   │   │   ├── actor_context.cpp
│   │   │   │   ├── actor_context.h
│   │   │   │   ├── actor_message.cpp
│   │   │   │   ├── actor_message.h
│   │   │   │   ├── actor_message_bus.cpp
│   │   │   │   ├── actor_message_bus.h
│   │   │   │   ├── boxing_zeros_actor.cpp
│   │   │   │   ├── callback_notify_actor.cpp
│   │   │   │   ├── case_actor.cpp
│   │   │   │   ├── collective_boxing_actor_context.cpp
│   │   │   │   ├── collective_boxing_actor_context.h
│   │   │   │   ├── copy_comm_net_actor.cpp
│   │   │   │   ├── esac_actor.cpp
│   │   │   │   ├── generic_actor_context.cpp
│   │   │   │   ├── generic_actor_context.h
│   │   │   │   ├── input_wise_actor.cpp
│   │   │   │   ├── input_wise_actor.h
│   │   │   │   ├── light_actor.cpp
│   │   │   │   ├── light_actor.h
│   │   │   │   ├── naive_actor.cpp
│   │   │   │   ├── naive_actor.h
│   │   │   │   ├── pack_actor.cpp
│   │   │   │   ├── reentrant_lock_actor.cpp
│   │   │   │   ├── register_slot.cpp
│   │   │   │   ├── register_slot.h
│   │   │   │   ├── repeat_actor.cpp
│   │   │   │   ├── sink_actor.cpp
│   │   │   │   ├── sink_actor.h
│   │   │   │   ├── source_tick_actor.cpp
│   │   │   │   ├── ssp_variable_proxy_actor.cpp
│   │   │   │   ├── tick_actor.cpp
│   │   │   │   ├── unpack_actor.cpp
│   │   │   │   └── wait_and_send_ids_actor.cpp
│   │   │   └── stream_context/
│   │   │       ├── common/
│   │   │       │   └── generic_stream_context.cpp
│   │   │       ├── cpu/
│   │   │       │   └── cpu_stream_context.cpp
│   │   │       ├── cuda/
│   │   │       │   └── cuda_stream_context.cpp
│   │   │       └── include/
│   │   │           ├── generic_stream_context.h
│   │   │           └── stream_context.h
│   │   ├── memory/
│   │   │   ├── chunk_manager.cpp
│   │   │   ├── chunk_manager.h
│   │   │   ├── memory_allocator.cpp
│   │   │   ├── memory_allocator.h
│   │   │   ├── memory_block.proto
│   │   │   ├── memory_case.proto
│   │   │   ├── memory_case_util.cpp
│   │   │   ├── memory_case_util.h
│   │   │   ├── memory_zone.cpp
│   │   │   └── memory_zone.h
│   │   ├── ndarray/
│   │   │   ├── binary_func.h
│   │   │   ├── cpu_concat_var_ndarray.h
│   │   │   ├── cpu_concat_var_ndarray_test.cpp
│   │   │   ├── cpu_ndarray.h
│   │   │   ├── cpu_ndarray_builder.h
│   │   │   ├── cpu_ndarray_copy.h
│   │   │   ├── cpu_slice_var_ndarray.h
│   │   │   ├── cpu_slice_var_ndarray_test.cpp
│   │   │   ├── cpu_var_ndarray.h
│   │   │   ├── cpu_var_ndarray_test.cpp
│   │   │   ├── ndarray_apply_binary.h
│   │   │   ├── ndarray_apply_binary_core.cpp
│   │   │   ├── ndarray_apply_binary_core.cu
│   │   │   ├── ndarray_apply_binary_core.h
│   │   │   ├── ndarray_apply_broadcast_binary.h
│   │   │   ├── ndarray_apply_broadcast_binary_core.cpp
│   │   │   ├── ndarray_apply_broadcast_binary_core.cu
│   │   │   ├── ndarray_apply_broadcast_binary_core.h
│   │   │   ├── ndarray_apply_broadcast_unary.h
│   │   │   ├── ndarray_apply_broadcast_unary_core.cpp
│   │   │   ├── ndarray_apply_broadcast_unary_core.cu
│   │   │   ├── ndarray_apply_broadcast_unary_core.h
│   │   │   ├── ndarray_apply_unary.h
│   │   │   ├── ndarray_apply_unary_core.cpp
│   │   │   ├── ndarray_apply_unary_core.cu
│   │   │   ├── ndarray_apply_unary_core.h
│   │   │   ├── ndarray_assign_core.cpp
│   │   │   ├── ndarray_assign_core.cu
│   │   │   ├── ndarray_assign_core.h
│   │   │   ├── ndarray_reduce.h
│   │   │   ├── ndarray_reduce_impl.cpp
│   │   │   ├── ndarray_reduce_impl.cu
│   │   │   ├── ndarray_reduce_impl.h
│   │   │   ├── ndarray_util.h
│   │   │   ├── slice.cpp
│   │   │   ├── slice.h
│   │   │   ├── slice_test.cpp
│   │   │   ├── unary_func.h
│   │   │   ├── xpu_binary_func_ndarray.h
│   │   │   ├── xpu_broadcast_ndarray.h
│   │   │   ├── xpu_ndarray_assign.cu
│   │   │   ├── xpu_ndarray_assign.h
│   │   │   ├── xpu_ndarray_base.h
│   │   │   ├── xpu_reduced_ndarray.h
│   │   │   ├── xpu_reshape_ndarray.h
│   │   │   ├── xpu_shape.cpp
│   │   │   ├── xpu_shape.h
│   │   │   ├── xpu_transpose_ndarray.h
│   │   │   ├── xpu_unary_func_ndarray.h
│   │   │   ├── xpu_util.h
│   │   │   ├── xpu_var_ndarray.h
│   │   │   └── xpu_var_ndarray_builder.h
│   │   ├── operator/
│   │   │   ├── acc_tick_op.cpp
│   │   │   ├── acc_tick_op.h
│   │   │   ├── arg_modifier_signature.proto
│   │   │   ├── assign_op.cpp
│   │   │   ├── boxing_identity_op.cpp
│   │   │   ├── boxing_op.cpp
│   │   │   ├── boxing_op.h
│   │   │   ├── boxing_zeros_op.cpp
│   │   │   ├── broadcast_to_compatible_with_op.cpp
│   │   │   ├── callback_notify_op.cpp
│   │   │   ├── callback_notify_op.h
│   │   │   ├── case_op.cpp
│   │   │   ├── case_op.h
│   │   │   ├── collective_boxing_ops.cpp
│   │   │   ├── collective_boxing_pack_op.cpp
│   │   │   ├── collective_boxing_unpack_op.cpp
│   │   │   ├── constant_like_op.cpp
│   │   │   ├── copy_comm_net_op.cpp
│   │   │   ├── copy_comm_net_op.h
│   │   │   ├── critical_section_callback_tick_op.cpp
│   │   │   ├── critical_section_wait_tick_op.cpp
│   │   │   ├── cwise_op.cpp
│   │   │   ├── cwise_op.h
│   │   │   ├── decode_random_op.h
│   │   │   ├── device_tick_op.cpp
│   │   │   ├── device_tick_op.h
│   │   │   ├── distribute_add_op.cpp
│   │   │   ├── distribute_clone_op.cpp
│   │   │   ├── distribute_concat_op.cpp
│   │   │   ├── distribute_split_op.cpp
│   │   │   ├── dst_subset_tick_op.cpp
│   │   │   ├── dynamic_reshape_op.cpp
│   │   │   ├── esac_op.cpp
│   │   │   ├── esac_op.h
│   │   │   ├── identity_op.cpp
│   │   │   ├── image_decoder_random_crop_resize_op.cpp
│   │   │   ├── input_op.cpp
│   │   │   ├── input_op.h
│   │   │   ├── interface_blob_conf.proto
│   │   │   ├── interface_op_util.cpp
│   │   │   ├── interface_op_util.h
│   │   │   ├── learning_rate_schedule_op.cpp
│   │   │   ├── nccl_send_recv_boxing_op.cpp
│   │   │   ├── nccl_send_recv_boxing_op_util.cpp
│   │   │   ├── nccl_send_recv_boxing_op_util.h
│   │   │   ├── op_attribute.proto
│   │   │   ├── op_conf.proto
│   │   │   ├── op_conf_symbol.cpp
│   │   │   ├── op_conf_symbol.h
│   │   │   ├── op_conf_util.h
│   │   │   ├── op_infer_cache.h
│   │   │   ├── op_node_signature.proto
│   │   │   ├── operator.cpp
│   │   │   ├── operator.h
│   │   │   ├── operator_util.cpp
│   │   │   ├── operator_util.h
│   │   │   ├── output_op.cpp
│   │   │   ├── output_op.h
│   │   │   ├── reduce_sbp_util.cpp
│   │   │   ├── reduce_sbp_util.h
│   │   │   ├── reentrant_lock_op.cpp
│   │   │   ├── reentrant_lock_op.h
│   │   │   ├── return_op.cpp
│   │   │   ├── return_op.h
│   │   │   ├── scalar_op_base.cpp
│   │   │   ├── scalar_op_base.h
│   │   │   ├── shape_elem_cnt_op.cpp
│   │   │   ├── shape_elem_cnt_op.h
│   │   │   ├── sink_tick_op.cpp
│   │   │   ├── sink_tick_op.h
│   │   │   ├── slice_boxing_op.cpp
│   │   │   ├── source_tick_op.cpp
│   │   │   ├── source_tick_op.h
│   │   │   ├── src_subset_tick_op.cpp
│   │   │   ├── sync_dynamic_resize_op.cpp
│   │   │   ├── tick_op.cpp
│   │   │   ├── tick_op.h
│   │   │   ├── total_loss_instance_num_op.cpp
│   │   │   ├── total_loss_instance_num_op.h
│   │   │   ├── user_op.cpp
│   │   │   ├── user_op.h
│   │   │   ├── variable_op.cpp
│   │   │   ├── variable_op.h
│   │   │   ├── wait_and_send_ids_op.cpp
│   │   │   └── wait_and_send_ids_op.h
│   │   ├── persistence/
│   │   │   ├── binary_in_stream.h
│   │   │   ├── binary_in_stream_with_local_copy.cpp
│   │   │   ├── binary_in_stream_with_local_copy.h
│   │   │   ├── binary_in_stream_without_local_copy.cpp
│   │   │   ├── binary_in_stream_without_local_copy.h
│   │   │   ├── file_system.cpp
│   │   │   ├── file_system.h
│   │   │   ├── file_system_test.cpp
│   │   │   ├── hadoop/
│   │   │   │   ├── hadoop_file_system.cpp
│   │   │   │   ├── hadoop_file_system.h
│   │   │   │   └── hdfs.h
│   │   │   ├── persistent_in_stream.cpp
│   │   │   ├── persistent_in_stream.h
│   │   │   ├── persistent_out_stream.cpp
│   │   │   ├── persistent_out_stream.h
│   │   │   ├── posix/
│   │   │   │   ├── posix_file_system.cpp
│   │   │   │   └── posix_file_system.h
│   │   │   ├── stream_scanner.cpp
│   │   │   ├── stream_scanner.h
│   │   │   ├── tee_persistent_log_stream.cpp
│   │   │   └── tee_persistent_log_stream.h
│   │   ├── platform/
│   │   │   ├── include/
│   │   │   │   ├── ibv.h
│   │   │   │   ├── pthread_fork.h
│   │   │   │   └── wrapper.h
│   │   │   └── lib/
│   │   │       ├── ibv_wrapper.cpp
│   │   │       ├── pthread_fork.cpp
│   │   │       └── wrapper.cpp
│   │   ├── profiler/
│   │   │   ├── event.cpp
│   │   │   ├── event.h
│   │   │   ├── event_recorder.cpp
│   │   │   ├── event_recorder.h
│   │   │   ├── kernel.cpp
│   │   │   ├── kernel.h
│   │   │   ├── kineto_shim.cpp
│   │   │   ├── kineto_shim.h
│   │   │   ├── profile_manager.cpp
│   │   │   ├── profile_manager.h
│   │   │   ├── profiler.cpp
│   │   │   ├── profiler.h
│   │   │   └── util.h
│   │   ├── record/
│   │   │   ├── coco.proto
│   │   │   └── record.proto
│   │   ├── register/
│   │   │   ├── blob.cpp
│   │   │   ├── blob.h
│   │   │   ├── blob_desc.cpp
│   │   │   ├── blob_desc.h
│   │   │   ├── blob_desc.proto
│   │   │   ├── logical_blob_id.proto
│   │   │   ├── op_blob_arg.proto
│   │   │   ├── op_blob_arg_info.h
│   │   │   ├── register.cpp
│   │   │   ├── register.h
│   │   │   ├── register_desc.cpp
│   │   │   ├── register_desc.h
│   │   │   ├── register_desc.proto
│   │   │   ├── register_manager.cpp
│   │   │   ├── register_manager.h
│   │   │   ├── runtime_register_desc.cpp
│   │   │   ├── runtime_register_desc.h
│   │   │   ├── tensor_slice_copier.cpp
│   │   │   ├── tensor_slice_copier.h
│   │   │   ├── tensor_slice_view.cpp
│   │   │   ├── tensor_slice_view.h
│   │   │   └── tensor_slice_view.proto
│   │   ├── rpc/
│   │   │   ├── include/
│   │   │   │   ├── base.h
│   │   │   │   ├── ctrl.h
│   │   │   │   ├── global_process_ctx.h
│   │   │   │   ├── grpc.h
│   │   │   │   ├── local.h
│   │   │   │   └── manager.h
│   │   │   └── lib/
│   │   │       ├── global_process_ctx.cpp
│   │   │       ├── grpc.cpp
│   │   │       └── local.cpp
│   │   ├── summary/
│   │   │   ├── event.proto
│   │   │   ├── graph.proto
│   │   │   ├── plugin_data.proto
│   │   │   ├── projector.proto
│   │   │   ├── summary.proto
│   │   │   └── tensor.proto
│   │   ├── thread/
│   │   │   ├── is_main_thread_test.cpp
│   │   │   ├── thread.cpp
│   │   │   ├── thread.h
│   │   │   ├── thread_global_id.cpp
│   │   │   ├── thread_global_id.h
│   │   │   ├── thread_manager.cpp
│   │   │   ├── thread_manager.h
│   │   │   ├── thread_pool.cpp
│   │   │   ├── thread_pool.h
│   │   │   ├── thread_runtime.h
│   │   │   ├── thread_runtime_factory.cpp
│   │   │   └── thread_runtime_factory.h
│   │   ├── transport/
│   │   │   ├── transport.cpp
│   │   │   ├── transport.h
│   │   │   └── transport_message.h
│   │   └── vm/
│   │       ├── access_blob_arg_cb_instruction_policy.h
│   │       ├── allocate_tensor_instruction_policy.cpp
│   │       ├── allocate_tensor_instruction_policy.h
│   │       ├── allocator.h
│   │       ├── barrier_instruction_policy.h
│   │       ├── bin_allocator.h
│   │       ├── bin_allocator_test.cpp
│   │       ├── caching_allocator.h
│   │       ├── control_stream_policy.h
│   │       ├── critical_section_instruction_policy.cpp
│   │       ├── critical_section_instruction_policy.h
│   │       ├── critical_section_status_querier.h
│   │       ├── critical_section_stream_policy.cpp
│   │       ├── critical_section_stream_policy.h
│   │       ├── ep_backend_allocator.cpp
│   │       ├── ep_backend_allocator.h
│   │       ├── ep_backend_host_allocator.cpp
│   │       ├── ep_backend_host_allocator.h
│   │       ├── ep_d2h_stream_policy.cpp
│   │       ├── ep_d2h_stream_policy.h
│   │       ├── ep_event.cpp
│   │       ├── ep_event.h
│   │       ├── ep_optional_event_record_status_querier.cpp
│   │       ├── ep_optional_event_record_status_querier.h
│   │       ├── ep_record_event_instruction_policy.h
│   │       ├── ep_stream_policy.cpp
│   │       ├── ep_stream_policy.h
│   │       ├── ep_stream_policy_base.cpp
│   │       ├── ep_stream_policy_base.h
│   │       ├── event_recorded_ep_stream_policy.cpp
│   │       ├── event_recorded_ep_stream_policy.h
│   │       ├── fuse_instruction_policy.h
│   │       ├── global_sync_instruction_policy.h
│   │       ├── instruction.cpp
│   │       ├── instruction.h
│   │       ├── instruction_fuse_type.h
│   │       ├── instruction_policy.cpp
│   │       ├── instruction_policy.h
│   │       ├── instruction_policy_util.h
│   │       ├── lazy_job_instruction_policy.h
│   │       ├── lazy_job_stream_policy.cpp
│   │       ├── lazy_job_stream_policy.h
│   │       ├── naive_instruction_status_querier.h
│   │       ├── op_call_instruction_policy.cpp
│   │       ├── op_call_instruction_policy.h
│   │       ├── pinned_ep_stream_policy.cpp
│   │       ├── pinned_ep_stream_policy.h
│   │       ├── probe.h
│   │       ├── ref_cnt_instruction_status_querier.h
│   │       ├── release_tensor_instruction_policy.h
│   │       ├── remat/
│   │       │   ├── allocator.cpp
│   │       │   ├── allocator.h
│   │       │   ├── disjoint_set.cpp
│   │       │   ├── disjoint_set.h
│   │       │   ├── env.cpp
│   │       │   ├── env.h
│   │       │   ├── util.cpp
│   │       │   └── util.h
│   │       ├── stream.cpp
│   │       ├── stream.h
│   │       ├── stream_create_stream_policy.h
│   │       ├── stream_get_allocator_stream_type.h
│   │       ├── stream_policy.cpp
│   │       ├── stream_policy.h
│   │       ├── stream_record_event_instruction_policy.cpp
│   │       ├── stream_record_event_instruction_policy.h
│   │       ├── stream_wait_event_instruction_policy.cpp
│   │       ├── stream_wait_event_instruction_policy.h
│   │       ├── stream_wait_instruction_policy.cpp
│   │       ├── stream_wait_instruction_policy.h
│   │       ├── symbol_storage.cpp
│   │       ├── symbol_storage.h
│   │       ├── sync_access_instruction_policy.cpp
│   │       ├── sync_access_instruction_policy.h
│   │       ├── sync_vm_mode_guard.h
│   │       ├── thread_ctx.cpp
│   │       ├── thread_ctx.h
│   │       ├── thread_safe_guard.h
│   │       ├── touch_tensors_instruction_policy.h
│   │       ├── virtual_machine.cpp
│   │       ├── virtual_machine.h
│   │       ├── virtual_machine_engine.cpp
│   │       ├── virtual_machine_engine.h
│   │       ├── virtual_machine_scope.cpp
│   │       ├── virtual_machine_scope.h
│   │       ├── vm_object.cpp
│   │       ├── vm_object.h
│   │       ├── vm_sync.h
│   │       ├── vm_util.cpp
│   │       └── vm_util.h
│   ├── extension/
│   │   ├── python/
│   │   │   ├── numpy.cpp
│   │   │   ├── numpy.h
│   │   │   ├── numpy_internal.h
│   │   │   ├── py_compute.cpp
│   │   │   ├── py_compute.h
│   │   │   ├── py_kernel_caller.cpp
│   │   │   ├── py_kernel_caller.h
│   │   │   ├── py_kernel_registry.cpp
│   │   │   └── py_kernel_registry.h
│   │   └── stack/
│   │       ├── foreign_stack_getter.h
│   │       ├── python/
│   │       │   ├── custom_eval_frame.c
│   │       │   ├── custom_eval_frame.h
│   │       │   ├── stack_getter.cpp
│   │       │   └── stack_getter.h
│   │       └── stacktrace.h
│   ├── ir/
│   │   ├── .gitignore
│   │   ├── CMakeLists.txt
│   │   ├── README.md
│   │   ├── include/
│   │   │   ├── CMakeLists.txt
│   │   │   ├── OneFlow/
│   │   │   │   ├── CMakeLists.txt
│   │   │   │   ├── Conversion/
│   │   │   │   │   ├── NVVMToCubin.h
│   │   │   │   │   └── OneFlowToTosa.h
│   │   │   │   ├── Extension.h
│   │   │   │   ├── OKL/
│   │   │   │   │   ├── Conversion/
│   │   │   │   │   │   ├── Conversion.h
│   │   │   │   │   │   └── OKLToLLVM.h
│   │   │   │   │   ├── Kernel/
│   │   │   │   │   │   ├── ComputeContext.h
│   │   │   │   │   │   ├── InferContext.h
│   │   │   │   │   │   ├── InitContext.h
│   │   │   │   │   │   ├── JITEngine.h
│   │   │   │   │   │   ├── JITOpInfer.h
│   │   │   │   │   │   ├── LauncherContext.h
│   │   │   │   │   │   ├── LauncherState.h
│   │   │   │   │   │   ├── README.md
│   │   │   │   │   │   ├── RegContext.h
│   │   │   │   │   │   ├── TmpBufferManager.h
│   │   │   │   │   │   └── WrapperContext.h
│   │   │   │   │   ├── OKLAttributes.h
│   │   │   │   │   ├── OKLAttributes.td
│   │   │   │   │   ├── OKLBase.td
│   │   │   │   │   ├── OKLDialect.h
│   │   │   │   │   ├── OKLDialect.td
│   │   │   │   │   ├── OKLOps.h
│   │   │   │   │   ├── OKLOps.td
│   │   │   │   │   ├── OKLTypes.h
│   │   │   │   │   ├── OKLTypes.td
│   │   │   │   │   └── passes.h
│   │   │   │   ├── OKM/
│   │   │   │   │   ├── Conversion/
│   │   │   │   │   │   └── Conversion.h
│   │   │   │   │   ├── OKMAttributes.h
│   │   │   │   │   ├── OKMAttributes.td
│   │   │   │   │   ├── OKMBase.td
│   │   │   │   │   ├── OKMDialect.h
│   │   │   │   │   ├── OKMDialect.td
│   │   │   │   │   ├── OKMOps.h
│   │   │   │   │   ├── OKMOps.td
│   │   │   │   │   ├── OKMPasses.td
│   │   │   │   │   └── passes.h
│   │   │   │   ├── OneFlowBase.td
│   │   │   │   ├── OneFlowDataTypeConversion.h
│   │   │   │   ├── OneFlowDialect.h
│   │   │   │   ├── OneFlowDialect.td
│   │   │   │   ├── OneFlowEnums.td
│   │   │   │   ├── OneFlowInterfaces.td
│   │   │   │   ├── OneFlowOpGetGen.td
│   │   │   │   ├── OneFlowOpTraits.h
│   │   │   │   ├── OneFlowOps.h
│   │   │   │   ├── OneFlowOps.td
│   │   │   │   ├── OneFlowPDLLPatterns.h
│   │   │   │   ├── OneFlowPasses.td
│   │   │   │   ├── OneFlowPatternUtils.h
│   │   │   │   ├── OneFlowPatterns.td
│   │   │   │   ├── OneFlowSupport.h
│   │   │   │   ├── OneFlowTypes.h
│   │   │   │   ├── OneFlowUserOps.td
│   │   │   │   ├── OneFlowUtils.h
│   │   │   │   ├── Passes.h
│   │   │   │   ├── SBP/
│   │   │   │   │   ├── SBPAttributes.h
│   │   │   │   │   ├── SBPBase.td
│   │   │   │   │   ├── SBPDialect.h
│   │   │   │   │   ├── SBPDialect.td
│   │   │   │   │   ├── SBPImporter.h
│   │   │   │   │   └── SBPOps.td
│   │   │   │   ├── Transform/
│   │   │   │   │   ├── AggregateOps.h
│   │   │   │   │   ├── AutoNhwc.h
│   │   │   │   │   ├── BufferHostRegister.h
│   │   │   │   │   ├── CSEWithAttributesIgnored.h
│   │   │   │   │   ├── ConvertInferenceOp.h
│   │   │   │   │   ├── EliminateAllocOps.h
│   │   │   │   │   ├── FuncOps.h
│   │   │   │   │   ├── OneFlow MLIR CodeGen ABI.md
│   │   │   │   │   ├── OneFlowMemPool.h
│   │   │   │   │   ├── OneFlowStream.h
│   │   │   │   │   ├── OutlineAndFuse.h
│   │   │   │   │   ├── TraitFolder.h
│   │   │   │   │   └── TransposeHelpers.h
│   │   │   │   ├── UserOpConversion.h
│   │   │   │   └── UserOpReflection.h
│   │   │   └── Transform/
│   │   │       ├── CMakeLists.txt
│   │   │       ├── TransformDialectExtension.h
│   │   │       ├── TransformDialectExtension.td
│   │   │       └── TransformStateExtension.h
│   │   ├── install-llvm.cmake
│   │   ├── lib/
│   │   │   ├── CMakeLists.txt
│   │   │   ├── OneFlow/
│   │   │   │   ├── CMakeLists.txt
│   │   │   │   ├── Conversion/
│   │   │   │   │   ├── NVVMToCubin.cpp
│   │   │   │   │   ├── OneFlowToLinalg.cpp
│   │   │   │   │   └── OneFlowToTosa.cpp
│   │   │   │   ├── OKL/
│   │   │   │   │   ├── Conversion/
│   │   │   │   │   │   ├── Conversion.cpp
│   │   │   │   │   │   ├── CudaGraphSupport.cpp
│   │   │   │   │   │   └── OKLToLLVM.cpp
│   │   │   │   │   ├── Kernel/
│   │   │   │   │   │   ├── ComputeContext.cpp
│   │   │   │   │   │   ├── InferContext.cpp
│   │   │   │   │   │   ├── JITEngine.cpp
│   │   │   │   │   │   ├── JITOpInfer.cpp
│   │   │   │   │   │   ├── KernelLaunchOp.cpp
│   │   │   │   │   │   ├── LauncherContext.cpp
│   │   │   │   │   │   ├── LauncherState.cpp
│   │   │   │   │   │   ├── RegContext.cpp
│   │   │   │   │   │   └── TmpBufferManager.cpp
│   │   │   │   │   ├── OKLDialect.cpp
│   │   │   │   │   ├── OKLOps.cpp
│   │   │   │   │   ├── OKLTypes.cpp
│   │   │   │   │   └── README-OriginVersion.md
│   │   │   │   ├── OKM/
│   │   │   │   │   ├── Conversion/
│   │   │   │   │   │   └── Conversion.cpp
│   │   │   │   │   ├── OKMDialect.cpp
│   │   │   │   │   └── passes.cpp
│   │   │   │   ├── OneFlowCanonicalizers.cpp
│   │   │   │   ├── OneFlowDataTypeConversion.cpp
│   │   │   │   ├── OneFlowDialect.cpp
│   │   │   │   ├── OneFlowInferReturnTypes.cpp
│   │   │   │   ├── OneFlowOpFolders.cpp
│   │   │   │   ├── OneFlowOpGetGen.cpp.in
│   │   │   │   ├── OneFlowOpTraits.cpp
│   │   │   │   ├── OneFlowOps.cpp
│   │   │   │   ├── OneFlowRewrites.cpp
│   │   │   │   ├── OneFlowSupport.cpp
│   │   │   │   ├── OneFlowTypes.cpp
│   │   │   │   ├── OneFlowUtils.cpp
│   │   │   │   ├── PDLL/
│   │   │   │   │   ├── AllocEliminationPatterns.cpp
│   │   │   │   │   ├── AllocEliminationPatterns.pdll
│   │   │   │   │   ├── CMakeLists.txt
│   │   │   │   │   ├── ForwardOpPatterns.cpp
│   │   │   │   │   ├── ForwardOpPatterns.pdll
│   │   │   │   │   ├── FuseConv2DBatchNormPattern.cpp
│   │   │   │   │   ├── FuseConv2DBatchNormPattern.pdll
│   │   │   │   │   ├── FuseOpsWithBackwardImplPattern.cpp
│   │   │   │   │   ├── FuseOpsWithBackwardImplPattern.pdll
│   │   │   │   │   ├── NormalizationPatterns.cpp
│   │   │   │   │   ├── NormalizationPatterns.pdll
│   │   │   │   │   └── OneFlowPDLLUtils.pdll
│   │   │   │   ├── Passes.cpp
│   │   │   │   ├── SBP/
│   │   │   │   │   ├── SBPAttributes.cpp
│   │   │   │   │   ├── SBPDialect.cpp
│   │   │   │   │   └── SBPImporter.cpp
│   │   │   │   ├── Transform/
│   │   │   │   │   ├── AggregateOps.cpp
│   │   │   │   │   ├── AutoNHWCOps.cpp
│   │   │   │   │   ├── AutoNhwc.cpp
│   │   │   │   │   ├── BufferHostRegister.cpp
│   │   │   │   │   ├── CSEWithAttributesIgnored.cpp
│   │   │   │   │   ├── ConvertInferenceOp.cpp
│   │   │   │   │   ├── EliminateAllocOps.cpp
│   │   │   │   │   ├── FuncOps.cpp
│   │   │   │   │   ├── GroupMatMulOps.cpp
│   │   │   │   │   ├── JITPasses.cpp
│   │   │   │   │   ├── OneFlowMemPool.cpp
│   │   │   │   │   ├── OneFlowStream.cpp
│   │   │   │   │   ├── OutlineAndFuse.cpp
│   │   │   │   │   └── TraitFolder.cpp
│   │   │   │   ├── TransposeHelpers.cpp
│   │   │   │   ├── UserOpConversion.cpp
│   │   │   │   └── UserOpReflection.cpp
│   │   │   └── Transform/
│   │   │       ├── CMakeLists.txt
│   │   │       ├── TransformDialectExtension.cpp
│   │   │       ├── TransformDialectInterpreter.cpp
│   │   │       └── TransformStateExtension.cpp
│   │   ├── llvm-in-tree.cmake
│   │   ├── oneflow-extension/
│   │   │   ├── CMakeLists.txt
│   │   │   ├── README.md
│   │   │   ├── include/
│   │   │   │   ├── CMakeLists.txt
│   │   │   │   ├── OneFlow/
│   │   │   │   │   ├── CMakeLists.txt
│   │   │   │   │   ├── JITOpInfer.h
│   │   │   │   │   ├── OneFlowLRJITRegistry.h
│   │   │   │   │   └── OneFlowRoundTrip.h
│   │   │   │   └── PyAst/
│   │   │   │       ├── Ast.h
│   │   │   │       └── AstMlirGen.h
│   │   │   ├── ir_pass.cpp
│   │   │   ├── lr_jit.cpp
│   │   │   ├── mlir_gen.cpp
│   │   │   ├── mlir_jit_op.cpp
│   │   │   └── mlir_jit_op_kernel.cpp
│   │   ├── oneflow-lite/
│   │   │   ├── CMakeLists.txt
│   │   │   ├── OneFlowLiteCompileMain.cpp
│   │   │   ├── include/
│   │   │   │   └── OneFlow/
│   │   │   │       ├── ConvertToLiteExecutable.h
│   │   │   │       ├── FlatbufferUtils.h
│   │   │   │       ├── OneFlowLiteUtils.h
│   │   │   │       └── Transform/
│   │   │   │           ├── FoldVariable.h
│   │   │   │           ├── InferPlacement.h
│   │   │   │           ├── InsertTransferOp.h
│   │   │   │           ├── Lowering/
│   │   │   │           │   ├── LoweringAscend.h
│   │   │   │           │   └── LoweringAscendUtils.h
│   │   │   │           ├── LoweringLaunchJob.h
│   │   │   │           ├── MemoryPlanning.h
│   │   │   │           └── PartitionLaunchJob.h
│   │   │   ├── lib/
│   │   │   │   ├── CMakeLists.txt
│   │   │   │   └── OneFlow/
│   │   │   │       ├── CMakeLists.txt
│   │   │   │       ├── ConvertToLiteExecutable.cpp
│   │   │   │       ├── FlatbufferUtils.cpp
│   │   │   │       ├── OneFlowLiteUtils.cpp
│   │   │   │       ├── Transform/
│   │   │   │       │   ├── FoldVariable.cpp
│   │   │   │       │   ├── InferPlacement.cpp
│   │   │   │       │   ├── InsertTransferOp.cpp
│   │   │   │       │   ├── Lowering/
│   │   │   │       │   │   └── LoweringAscend.cpp
│   │   │   │       │   ├── LoweringLaunchJob.cpp
│   │   │   │       │   ├── MemoryPlanning.cpp
│   │   │   │       │   └── PartitionLaunchJob.cpp
│   │   │   │       └── cmake/
│   │   │   │           └── FindAscendSdk.cmake
│   │   │   └── schemas/
│   │   │       ├── CMakeLists.txt
│   │   │       ├── attributes/
│   │   │       │   ├── CMakeLists.txt
│   │   │       │   ├── bool.fbs
│   │   │       │   ├── f32.fbs
│   │   │       │   ├── f32s.fbs
│   │   │       │   ├── f64.fbs
│   │   │       │   ├── i32.fbs
│   │   │       │   ├── i32s.fbs
│   │   │       │   ├── i64.fbs
│   │   │       │   ├── i64s.fbs
│   │   │       │   ├── shape.fbs
│   │   │       │   ├── shapes.fbs
│   │   │       │   ├── str.fbs
│   │   │       │   └── strs.fbs
│   │   │       ├── executable.fbs
│   │   │       └── install_flatcc.cmake
│   │   ├── oneflow-opt/
│   │   │   ├── CMakeLists.txt
│   │   │   ├── README.md
│   │   │   └── oneflow-opt.cpp
│   │   ├── oneflow-runner/
│   │   │   ├── CMakeLists.txt
│   │   │   └── oneflow-runner.cpp
│   │   ├── oneflow-runtime/
│   │   │   ├── CMakeLists.txt
│   │   │   └── lib/
│   │   │       ├── CMakeLists.txt
│   │   │       └── Runtime.cpp
│   │   ├── oneflow-translate/
│   │   │   ├── CMakeLists.txt
│   │   │   ├── README.md
│   │   │   ├── include/
│   │   │   │   ├── CMakeLists.txt
│   │   │   │   └── OneFlow/
│   │   │   │       ├── CMakeLists.txt
│   │   │   │       └── MLIROneFlowTranslation.h
│   │   │   ├── lib/
│   │   │   │   ├── CMakeLists.txt
│   │   │   │   └── OneFlow/
│   │   │   │       ├── CMakeLists.txt
│   │   │   │       ├── Importer.cpp
│   │   │   │       └── MLIROneFlowTranslation.cpp
│   │   │   └── oneflow-translate.cpp
│   │   └── test/
│   │       ├── CMakeLists.txt
│   │       ├── Frontend/
│   │       │   ├── lit.local.cfg
│   │       │   ├── oneflow_to_iree.mlir
│   │       │   └── tosa_to_elf.mlir
│   │       ├── GPU/
│   │       │   ├── lit.local.cfg
│   │       │   └── nvvm_to_cubin.mlir
│   │       ├── OneFlow/
│   │       │   ├── auto_nhwc/
│   │       │   │   ├── lit.local.cfg
│   │       │   │   ├── test_nhwc_batchnorm_relu.py
│   │       │   │   ├── test_nhwc_bias_add.py
│   │       │   │   ├── test_nhwc_conv.py
│   │       │   │   ├── test_nhwc_conv2d_maxpool2d.py
│   │       │   │   ├── test_nhwc_conv_relu_add.py
│   │       │   │   ├── test_nhwc_lenet.py
│   │       │   │   ├── test_nhwc_maxpool_2d.py
│   │       │   │   ├── test_nhwc_resnet.py
│   │       │   │   ├── test_nhwc_transpose_eliminate.py
│   │       │   │   └── test_resnet101_benchmark.py
│   │       │   ├── conversion/
│   │       │   │   ├── lower_to_tosa.mlir
│   │       │   │   ├── lower_to_tosa_signed.mlir
│   │       │   │   └── oneflow_to_tosa.mlir
│   │       │   ├── cse.mlir
│   │       │   ├── cuda_code_gen/
│   │       │   │   ├── gpu_copy_arg.mlir
│   │       │   │   ├── lit.local.cfg
│   │       │   │   ├── test_append_oneflow_stream.mlir
│   │       │   │   ├── test_cast_ops_to_signless.mlir
│   │       │   │   ├── test_fold_alloc_to_subview.mlir
│   │       │   │   ├── test_fuser_cast_scale.py
│   │       │   │   ├── test_gpu_all_reduce.mlir
│   │       │   │   ├── test_insert_ofmempool.mlir
│   │       │   │   ├── test_matmul.py
│   │       │   │   ├── test_mgpu_to_oneflow_stream.mlir
│   │       │   │   └── tosa_to_linalg.mlir
│   │       │   ├── folding/
│   │       │   │   ├── test_conv_bn.py
│   │       │   │   └── test_simple_multiply.py
│   │       │   ├── fuse/
│   │       │   │   ├── fuse_forward_ops.mlir
│   │       │   │   ├── test_cast_optimal_pass.py
│   │       │   │   └── test_fuse_pad_conv.py
│   │       │   ├── group_matmul.mlir
│   │       │   ├── jit_outline_func.mlir
│   │       │   ├── kernel_launch/
│   │       │   │   ├── OKLPass/
│   │       │   │   │   ├── lower_launcher_to_llvm_ptr.mlir
│   │       │   │   │   ├── lower_okl_to_llvm_call.mlir
│   │       │   │   │   └── tag_cuda_graph_support.mlir
│   │       │   │   ├── OKMPass/
│   │       │   │   │   ├── extract_okm_tensor.mlir
│   │       │   │   │   ├── okm_to_okl.mlir
│   │       │   │   │   ├── opt_okm_memref.mlir
│   │       │   │   │   └── wrap_okm_kernel.mlir
│   │       │   │   ├── OneFlowPass/
│   │       │   │   │   ├── aggregate_compute_ops.mlir
│   │       │   │   │   └── wrap_ops_to_kernel_launch/
│   │       │   │   │       ├── cuda_graph.mlir
│   │       │   │   │       ├── lit.local.cfg
│   │       │   │   │       └── simple.mlir
│   │       │   │   └── test_resnet.py
│   │       │   ├── networks/
│   │       │   │   ├── __init__.py
│   │       │   │   └── resnet50.py
│   │       │   ├── oneflow-opt.mlir
│   │       │   ├── oneflow-translate.mlir
│   │       │   ├── psig/
│   │       │   │   ├── error_parse.mlir
│   │       │   │   ├── sbp_parse.mlir
│   │       │   │   ├── test_2nd_basic_parse.py
│   │       │   │   └── test_basic_parse.py
│   │       │   ├── traits.mlir
│   │       │   └── with_cuda/
│   │       │       ├── lit.local.cfg
│   │       │       ├── test_conv_bn_auto_nhwc.py
│   │       │       ├── test_fuse_bias_add_dropout.py
│   │       │       ├── test_fuse_bias_add_gelu.py
│   │       │       ├── test_fuse_bn_add_relu.py
│   │       │       ├── test_fuse_gelu.py
│   │       │       ├── test_fuse_scale_tril.py
│   │       │       ├── test_fused_matmul_bias.py
│   │       │       ├── test_fused_multi_head_attention_inference.py
│   │       │       └── test_graph_save_and_load.py
│   │       ├── Transform/
│   │       │   ├── lit.local.cfg
│   │       │   ├── matmul.mlir
│   │       │   ├── softmax.mlir
│   │       │   ├── softmax_codegen_spec.mlir
│   │       │   ├── softmax_codegen_spec_no_vectorize.mlir
│   │       │   └── test_dialect.mlir
│   │       ├── lit.cfg.py
│   │       └── lit.site.cfg.py.in
│   ├── maybe/
│   │   ├── config.h
│   │   ├── error.h
│   │   ├── error_test.cpp
│   │   ├── just.h
│   │   ├── just_test.cpp
│   │   ├── maybe.h
│   │   ├── maybe_test.cpp
│   │   ├── optional.h
│   │   ├── optional_test.cpp
│   │   ├── type_traits.h
│   │   ├── type_traits_test.cpp
│   │   ├── utility.h
│   │   ├── utility_test.cpp
│   │   ├── variant.h
│   │   └── variant_test.cpp
│   └── user/
│       ├── data/
│       │   ├── batch_dataset.h
│       │   ├── batch_random_shuffle_dataset.h
│       │   ├── coco_data_reader.cpp
│       │   ├── coco_data_reader.h
│       │   ├── coco_dataset.cpp
│       │   ├── coco_dataset.h
│       │   ├── coco_parser.cpp
│       │   ├── coco_parser.h
│       │   ├── data_reader.h
│       │   ├── dataset.h
│       │   ├── distributed_training_dataset.h
│       │   ├── distributed_util.h
│       │   ├── gpt_dataset.cpp
│       │   ├── gpt_dataset.h
│       │   ├── group_batch_dataset.h
│       │   ├── ofrecord_data_reader.h
│       │   ├── ofrecord_dataset.h
│       │   ├── ofrecord_image_classification_data_reader.h
│       │   ├── ofrecord_image_classification_dataset.cpp
│       │   ├── ofrecord_image_classification_dataset.h
│       │   ├── ofrecord_image_classification_parser.h
│       │   ├── ofrecord_parser.h
│       │   ├── parser.h
│       │   └── random_shuffle_dataset.h
│       ├── image/
│       │   ├── crop_window.h
│       │   ├── image_util.cpp
│       │   ├── image_util.h
│       │   ├── jpeg_decoder.cpp
│       │   ├── jpeg_decoder.h
│       │   ├── jpeg_decoder_test.cpp
│       │   ├── random_crop_generator.cpp
│       │   └── random_crop_generator.h
│       ├── kernels/
│       │   ├── acc_kernel.cpp
│       │   ├── activation_kernels.cpp
│       │   ├── adaptive_avg_pool_cpu_kernel.cpp
│       │   ├── adaptive_avg_pool_gpu_kernel.cu
│       │   ├── adaptive_max_pool_cpu_kernel.cpp
│       │   ├── adaptive_max_pool_gpu_kernel.cu
│       │   ├── adaptive_pool_kernel_util.h
│       │   ├── add_n_kernel.cpp
│       │   ├── affine_grid_kernel.cpp
│       │   ├── affine_grid_kernel.cu
│       │   ├── affine_grid_kernel.h
│       │   ├── arange_kernel.cpp
│       │   ├── arange_kernel_util.cpp
│       │   ├── arange_kernel_util.cu
│       │   ├── arange_kernel_util.h
│       │   ├── arg_sort_kernel.cpp
│       │   ├── arg_sort_kernel.cu
│       │   ├── arg_where_kernel.cpp
│       │   ├── arg_where_kernel_util.cpp
│       │   ├── arg_where_kernel_util.cu
│       │   ├── arg_where_kernel_util.h
│       │   ├── argmax_kernel.cpp
│       │   ├── argmax_kernel.cu
│       │   ├── as_strided_kernel.cpp
│       │   ├── as_strided_kernel.cu
│       │   ├── assign_if_kernel.cpp
│       │   ├── assign_if_kernel.cu
│       │   ├── assign_kernel.cpp
│       │   ├── avg_pool_kernel.cpp
│       │   ├── avg_pool_kernel.cu
│       │   ├── avg_pool_kernel_util.cpp
│       │   ├── avg_pool_kernel_util.h
│       │   ├── batch_gather_kernel.cpp
│       │   ├── batch_gather_kernel_util.cpp
│       │   ├── batch_gather_kernel_util.cu
│       │   ├── batch_gather_kernel_util.h
│       │   ├── batch_norm_backward_elemt_kernel.cu
│       │   ├── batch_norm_backward_reduce_kernel.cu
│       │   ├── batch_norm_elemt_kernel.cu
│       │   ├── batch_norm_gather_stats_with_counts_kernel.cu
│       │   ├── batch_norm_kernel_utils.h
│       │   ├── batch_norm_stats_kernel.cu
│       │   ├── bernoulli_kernel.cpp
│       │   ├── bias_add_kernel.cpp
│       │   ├── binary_concat_kernel.cu
│       │   ├── binary_cross_entropy_kernel.cpp
│       │   ├── binary_cross_entropy_kernel.cu
│       │   ├── binary_cross_entropy_with_logits_kernel.cpp
│       │   ├── binary_cross_entropy_with_logits_kernel.cu
│       │   ├── binary_cross_entropy_with_logits_mean_kernel.cu
│       │   ├── binary_cross_entropy_with_logits_mean_kernel_util.h
│       │   ├── binary_cross_entropy_with_logits_reduce_mean.cpp
│       │   ├── bincount_kernel.cpp
│       │   ├── bincount_kernel.cu
│       │   ├── broadcast_div_grad_kernel.cpp
│       │   ├── broadcast_like_kernel.cpp
│       │   ├── cast_kernel.cpp
│       │   ├── cast_to_static_shape_kernel.cpp
│       │   ├── categorical_ordinal_encode_kernel.cpp
│       │   ├── categorical_ordinal_encode_kernel_util.cpp
│       │   ├── categorical_ordinal_encode_kernel_util.cu
│       │   ├── categorical_ordinal_encode_kernel_util.h
│       │   ├── clip_by_value_kernel.cpp
│       │   ├── clip_by_value_kernel.cu
│       │   ├── clip_by_value_kernel.h
│       │   ├── coco_reader_kernel.cpp
│       │   ├── collective_communication/
│       │   │   ├── cpu/
│       │   │   │   ├── cpu_all_gather.cpp
│       │   │   │   ├── cpu_all_reduce.cpp
│       │   │   │   ├── cpu_broadcast.cpp
│       │   │   │   ├── cpu_collective_communication_util.h
│       │   │   │   ├── cpu_communication_context.cpp
│       │   │   │   ├── cpu_communication_context.h
│       │   │   │   ├── cpu_recv.cpp
│       │   │   │   ├── cpu_reduce.cpp
│       │   │   │   ├── cpu_reduce_scatter.cpp
│       │   │   │   └── cpu_send.cpp
│       │   │   ├── cuda/
│       │   │   │   ├── cuda_all_gather.cpp
│       │   │   │   ├── cuda_all_reduce.cpp
│       │   │   │   ├── cuda_all_to_all.cpp
│       │   │   │   ├── cuda_broadcast.cpp
│       │   │   │   ├── cuda_communication_context.cpp
│       │   │   │   ├── cuda_communication_context.h
│       │   │   │   ├── cuda_recv.cpp
│       │   │   │   ├── cuda_reduce.cpp
│       │   │   │   ├── cuda_reduce_scatter.cpp
│       │   │   │   ├── cuda_send.cpp
│       │   │   │   ├── cuda_send_recv_util.cpp
│       │   │   │   └── cuda_send_recv_util.h
│       │   │   └── include/
│       │   │       ├── all_gather.h
│       │   │       ├── all_reduce.h
│       │   │       ├── all_to_all.h
│       │   │       ├── broadcast.h
│       │   │       ├── collective_communication.h
│       │   │       ├── communication_context.h
│       │   │       ├── recv.h
│       │   │       ├── reduce.h
│       │   │       ├── reduce_scatter.h
│       │   │       └── send.h
│       │   ├── combined_margin_loss_kernel.cpp
│       │   ├── combined_margin_loss_kernel.cu
│       │   ├── communicate_util.cpp
│       │   ├── communicate_util.h
│       │   ├── complex_kernels.cpp
│       │   ├── concat_kernel.cpp
│       │   ├── constant_kernel.cpp
│       │   ├── conv_cudnn_kernels.cpp
│       │   ├── conv_cutlass_kernels.cu
│       │   ├── conv_kernels.cpp
│       │   ├── convert_memory_format_kernel.cpp
│       │   ├── convert_memory_format_util.cpp
│       │   ├── convert_memory_format_util.h
│       │   ├── copy_data_content_kernel.cpp
│       │   ├── copy_hd_kernel.cpp
│       │   ├── copy_kernel.cpp
│       │   ├── count_not_finite_kernel.cpp
│       │   ├── count_not_finite_kernel.cu
│       │   ├── ctc_greedy_decoder.cpp
│       │   ├── ctc_greedy_decoder.cu
│       │   ├── ctc_greedy_decoder.h
│       │   ├── ctc_loss_kernel.cpp
│       │   ├── ctc_loss_kernel_util.cpp
│       │   ├── ctc_loss_kernel_util.cu
│       │   ├── ctc_loss_kernel_util.h
│       │   ├── cublas_bias_add_relu_matmul_grad_kernel.cu
│       │   ├── cublas_fused_matmul_bias_add_grad.cu
│       │   ├── cublas_fused_mlp_grad_kernel.cu
│       │   ├── cublas_fused_mlp_kernel.cu
│       │   ├── cublas_fused_mlp_util.cuh
│       │   ├── cufft_plan_cache.h
│       │   ├── cum_backward_kernel.cpp
│       │   ├── cum_backward_kernel.cu
│       │   ├── cum_forward_kernel.cpp
│       │   ├── cum_forward_kernel.cu
│       │   ├── cutlass_conv_tuner.cpp
│       │   ├── cutlass_conv_tuner.h
│       │   ├── data_shuffle_kernel.cu
│       │   ├── deconv_cpu_kernel.cpp
│       │   ├── deconv_cudnn_kernel.cpp
│       │   ├── deform_conv_kernel.cpp
│       │   ├── deform_conv_kernel.cu
│       │   ├── det_kernel.cpp
│       │   ├── diag_kernel.cpp
│       │   ├── diag_kernel.cu
│       │   ├── diag_kernel.h
│       │   ├── diagonal_kernel.cpp
│       │   ├── diagonal_kernel.cu
│       │   ├── dim_gather_kernel_util.cpp
│       │   ├── dim_gather_kernel_util.cu
│       │   ├── dim_gather_kernel_util.h
│       │   ├── dim_gather_kernels.cpp
│       │   ├── dim_scatter_kernel_util.cpp
│       │   ├── dim_scatter_kernel_util.cu
│       │   ├── dim_scatter_kernel_util.h
│       │   ├── dim_scatter_kernels.cpp
│       │   ├── dim_scatter_scalar_kernel_util.cpp
│       │   ├── dim_scatter_scalar_kernel_util.cu
│       │   ├── dim_scatter_scalar_kernel_util.h
│       │   ├── dim_scatter_scalar_kernels.cpp
│       │   ├── distributions/
│       │   │   ├── common.h
│       │   │   ├── distribution_template_util.cuh
│       │   │   ├── exponential_distribution.cpp
│       │   │   ├── exponential_distribution.cu
│       │   │   ├── exponential_distribution.h
│       │   │   ├── exponential_kernel.cpp
│       │   │   ├── exponential_kernel.h
│       │   │   ├── multinomial_with_replacement_kernel.cpp
│       │   │   ├── multinomial_with_replacement_kernel.cu
│       │   │   ├── normal_distribution.cpp
│       │   │   ├── normal_distribution.cu
│       │   │   ├── normal_distribution.h
│       │   │   ├── normal_kernel.cpp
│       │   │   ├── normal_kernel.h
│       │   │   ├── uniform_distribution.cpp
│       │   │   ├── uniform_distribution.cu
│       │   │   ├── uniform_distribution.h
│       │   │   ├── uniform_int_distribution.cpp
│       │   │   ├── uniform_int_distribution.cu
│       │   │   ├── uniform_int_distribution.h
│       │   │   ├── uniform_int_kernel.cpp
│       │   │   ├── uniform_int_kernel.h
│       │   │   ├── uniform_kernel.cpp
│       │   │   └── uniform_kernel.h
│       │   ├── dot_kernel.cpp
│       │   ├── dropout_kernel.cpp
│       │   ├── dropout_kernel.cu
│       │   ├── dropout_kernel.h
│       │   ├── dynamic_loss_scale_schedule_kernel.cpp
│       │   ├── dynamic_loss_scale_schedule_kernel.cu
│       │   ├── eager_b_to_s_kernel.cpp
│       │   ├── eager_ccl_kernel.cpp
│       │   ├── eager_nccl_s2s_kernel.cu
│       │   ├── eager_p_to_b_kernel.cpp
│       │   ├── eager_p_to_s_kernel.cpp
│       │   ├── eager_s_to_b_kernel.cpp
│       │   ├── eager_s_to_p_kernel.cpp
│       │   ├── eager_s_to_s_kernel.cpp
│       │   ├── eager_symmetric_s_to_p_kernel.cpp
│       │   ├── elementwise_maximum_minimum_kernel.cpp
│       │   ├── elementwise_maximum_minimum_kernel.cu
│       │   ├── elementwise_maximum_minimum_kernel.h
│       │   ├── elementwise_primitive_kernel.h
│       │   ├── embedding_kernel.cpp
│       │   ├── embedding_kernel.cu
│       │   ├── embedding_kernel_util.cpp
│       │   ├── embedding_kernel_util.cu
│       │   ├── embedding_kernel_util.h
│       │   ├── empty_kernel.cpp
│       │   ├── erfinv_kernel.cpp
│       │   ├── erfinv_kernel.cu
│       │   ├── expand_kernel.cpp
│       │   ├── eye_kernel.cpp
│       │   ├── eye_kernel_util.cpp
│       │   ├── eye_kernel_util.cu
│       │   ├── eye_kernel_util.h
│       │   ├── fake_quantization_kernel.cpp
│       │   ├── fake_quantization_kernel.cu
│       │   ├── fft_kernel_util.cpp
│       │   ├── fft_kernel_util.cu
│       │   ├── fft_kernel_util.h
│       │   ├── fft_kernels.cpp
│       │   ├── fill_kernel.cpp
│       │   ├── fill_kernel.cu
│       │   ├── flip_kernel.cpp
│       │   ├── flip_kernel.cu
│       │   ├── fold_kernel.cpp
│       │   ├── fold_kernel_util.cpp
│       │   ├── fold_kernel_util.cu
│       │   ├── fold_kernel_util.h
│       │   ├── frac_kernel.cpp
│       │   ├── frac_kernel.cu
│       │   ├── fused_attention_kernels.cu
│       │   ├── fused_bias_add_kernel.cu
│       │   ├── fused_bias_add_scale_mask_softmax_dropout.cu
│       │   ├── fused_cast_scale_kernel.cpp
│       │   ├── fused_cast_scale_kernel.cu
│       │   ├── fused_center_kernel.cu
│       │   ├── fused_clip_grad.cu
│       │   ├── fused_clip_grad.h
│       │   ├── fused_clip_grad_util.h
│       │   ├── fused_codegeex_qkv_reshape_kernel.cu
│       │   ├── fused_cross_feature_interaction.cu
│       │   ├── fused_cross_feature_interaction_grad.cu
│       │   ├── fused_dot_feature_interaction_kernel.cu
│       │   ├── fused_gelu_mul_kernel.cu
│       │   ├── fused_get_bounding_boxes_coord_kernel.cu
│       │   ├── fused_get_ciou_diagonal_angle_kernel.cu
│       │   ├── fused_get_ciou_result_kernel.cu
│       │   ├── fused_get_convex_diagonal_squared_kernel.cu
│       │   ├── fused_get_intersection_area_kernel.cu
│       │   ├── fused_get_iou_kernel.cu
│       │   ├── fused_glu_kernel.cu
│       │   ├── fused_glu_without_linear_grad_kernel.cu
│       │   ├── fused_gru_cell_kernel.cu
│       │   ├── fused_lstm_cell_kernel.cu
│       │   ├── fused_matmul_bias_add_relu_dropout.cu
│       │   ├── fused_matmul_bias_kernel.cu
│       │   ├── fused_relu_dropout_grad_kernel.cu
│       │   ├── fused_rnn_cell_kernel_util.h
│       │   ├── fused_scale_mask_bias_softmax.cu
│       │   ├── fused_scale_mask_softmax.cu
│       │   ├── fused_scale_mask_softmax_dropout.cu
│       │   ├── fused_self_attention_query_mul_key_and_value_kernel.cu
│       │   ├── fused_softmax.cuh
│       │   ├── fused_tril_scale_softmax_mask_scale_kernel.cu
│       │   ├── fused_weighted_sum_kernel.cpp
│       │   ├── fused_weighted_sum_kernel.cu
│       │   ├── gather_kernel.cpp
│       │   ├── gather_kernel_util.cpp
│       │   ├── gather_kernel_util.cu
│       │   ├── gather_kernel_util.h
│       │   ├── generate_random_batch_permutation_indices_kernel.cpp
│       │   ├── generate_random_batch_permutation_indices_kernel.cu
│       │   ├── gpt_data_loader_kernel.cpp
│       │   ├── greater_inplace_kernel.cpp
│       │   ├── greater_inplace_kernel_util.cpp
│       │   ├── greater_inplace_kernel_util.cu
│       │   ├── greater_inplace_kernel_util.h
│       │   ├── grid_sample_kernel.cpp
│       │   ├── grid_sample_kernel_util.cpp
│       │   ├── grid_sample_kernel_util.cu
│       │   ├── grid_sample_kernel_util.h
│       │   ├── group_conv_kernel.cpp
│       │   ├── group_deconv_kernel.cpp
│       │   ├── group_norm_kernel.cu
│       │   ├── grouped_matmul_bias.cu
│       │   ├── groupwise_quantization_kernels.cu
│       │   ├── host_scalar_add_by_tensor_kernel.cu
│       │   ├── image_batch_align_kernel.cpp
│       │   ├── image_decode_kernel.cpp
│       │   ├── image_object_preprocess_kernels.cpp
│       │   ├── image_preprocess_kernels.cpp
│       │   ├── image_preprocess_kernels.cu
│       │   ├── image_resize_kernels.cpp
│       │   ├── image_target_resize_kernel.cpp
│       │   ├── in_top_k_kernel.cpp
│       │   ├── in_top_k_kernel_util.cpp
│       │   ├── in_top_k_kernel_util.cu
│       │   ├── in_top_k_kernel_util.h
│       │   ├── index_add_kernel.cpp
│       │   ├── index_add_kernel.cu
│       │   ├── indexed_slices_reduce_sum_kernel.cpp
│       │   ├── indexed_slices_reduce_sum_kernel_util.cpp
│       │   ├── indexed_slices_reduce_sum_kernel_util.h
│       │   ├── inv_kernels.cpp
│       │   ├── inv_kernels.cu
│       │   ├── kl_div_kernel.cpp
│       │   ├── kl_div_kernel.cu
│       │   ├── l1_l2_regularize_gradient_kernel.cpp
│       │   ├── l1_l2_regularize_gradient_kernel_util.cpp
│       │   ├── l1_l2_regularize_gradient_kernel_util.cu
│       │   ├── l1_l2_regularize_gradient_kernel_util.h
│       │   ├── l2_normalize_kernel.cpp
│       │   ├── l2_normalize_kernel.cu
│       │   ├── layer_norm_cpu_kernel.cpp
│       │   ├── layer_norm_gpu_kernel.cu
│       │   ├── lerp_kernel.cpp
│       │   ├── lerp_kernel_util.cpp
│       │   ├── lerp_kernel_util.cu
│       │   ├── lerp_kernel_util.h
│       │   ├── linalg_cross_kernel.cpp
│       │   ├── linalg_cross_kernel.cu
│       │   ├── log_softmax_kernel.cpp
│       │   ├── logical_not_kernel.cpp
│       │   ├── loss_kernel_util.h
│       │   ├── lu_decomposition_kernel.cu
│       │   ├── masked_fill_kernel.cpp
│       │   ├── math_binary_broadcast_kernels.cpp
│       │   ├── math_binary_elementwise_func.h
│       │   ├── math_binary_elementwise_kernel.cpp
│       │   ├── math_binary_elementwise_kernel.cu
│       │   ├── math_unary_elementwise_func.h
│       │   ├── math_unary_elementwise_primitive_kernel.cpp
│       │   ├── matmul_kernels.cpp
│       │   ├── matrix_vector_product_kernel.cpp
│       │   ├── max_pool_kernel.cpp
│       │   ├── max_pool_kernel.cu
│       │   ├── max_pool_kernel_util.cpp
│       │   ├── max_pool_kernel_util.h
│       │   ├── max_unpool_kernel.cpp
│       │   ├── max_unpool_kernel.cu
│       │   ├── max_unpool_kernel_util.cpp
│       │   ├── max_unpool_kernel_util.h
│       │   ├── median_kernel.cpp
│       │   ├── median_kernel.cu
│       │   ├── median_with_indices_kernel.cpp
│       │   ├── median_with_indices_kernel.cu
│       │   ├── min_max_observer_kernel.cpp
│       │   ├── min_max_observer_kernel.cu
│       │   ├── mode_kernel.cpp
│       │   ├── model_update_kernel_util.cpp
│       │   ├── model_update_kernel_util.cu
│       │   ├── model_update_kernel_util.h
│       │   ├── model_update_kernels.cpp
│       │   ├── moving_average_min_max_observer_kernel.cpp
│       │   ├── moving_average_min_max_observer_kernel.cu
│       │   ├── multi_reduce_kernel_util.h
│       │   ├── multi_reduce_kernels.cpp
│       │   ├── multi_reduce_kernels.cu
│       │   ├── multi_reduce_kernels.h
│       │   ├── multi_tensor_model_update_kernel.cpp
│       │   ├── multi_tensor_model_update_kernel_util.cu
│       │   ├── multi_tensor_model_update_kernel_util.h
│       │   ├── mutable_cast_once_kernel.cpp
│       │   ├── narrow_kernel.cpp
│       │   ├── nccl_logical_2d_sbp_kernels.cpp
│       │   ├── nccl_logical_fusion_kernel.cpp
│       │   ├── nccl_logical_kernels.cpp
│       │   ├── nccl_logical_send_recv_kernel.cpp
│       │   ├── nd_index_slice_kernels.cpp
│       │   ├── nd_index_slice_kernels.cu
│       │   ├── nd_index_slice_kernels.h
│       │   ├── nd_index_slice_util.h
│       │   ├── nll_kernel.cpp
│       │   ├── nll_kernel_util.cpp
│       │   ├── nll_kernel_util.cu
│       │   ├── nll_kernel_util.h
│       │   ├── nms_kernel.cpp
│       │   ├── nms_kernel.cu
│       │   ├── noncontiguous_binary_op.cu
│       │   ├── nop_kernel.cpp
│       │   ├── normalization_kernel.cpp
│       │   ├── normalization_kernel.cu
│       │   ├── nvtx_range_kernel.cu
│       │   ├── ofrecord_decoder_kernels.cpp
│       │   ├── ofrecord_image_classification_reader_kernel.cpp
│       │   ├── ofrecord_reader_kernel.cpp
│       │   ├── one_embedding_data_shuffle.cuh
│       │   ├── one_embedding_embedding_gradient_shuffle_p2p_kernel.cu
│       │   ├── one_embedding_embedding_shuffle_p2p_kernel.cu
│       │   ├── one_embedding_id_shuffle_p2p_kernel.cu
│       │   ├── one_embedding_kernels.cu
│       │   ├── one_embedding_update_kernels.cu
│       │   ├── one_hot_kernel.cpp
│       │   ├── one_hot_kernel.cu
│       │   ├── ones_like_kernel.cpp
│       │   ├── op_kernel_wrapper.h
│       │   ├── p2p_comm_kernel.cpp
│       │   ├── pack_kernel.cpp
│       │   ├── pad_kernel.cpp
│       │   ├── partial_fc_sample_kernel.cu
│       │   ├── pocketfft_hdronly.h
│       │   ├── pocketfftplan.h
│       │   ├── prelu_kernel.cpp
│       │   ├── prelu_kernel.cu
│       │   ├── quantization_kernel.cpp
│       │   ├── quantization_kernel.cu
│       │   ├── radix_sort.cuh
│       │   ├── random_crop_kernel_state.cpp
│       │   ├── random_crop_kernel_state.h
│       │   ├── random_mask_generator.cpp
│       │   ├── random_mask_generator.cu
│       │   ├── random_mask_generator.h
│       │   ├── random_mask_like_kernel.cpp
│       │   ├── random_mask_like_kernel.h
│       │   ├── random_seed_util.cpp
│       │   ├── random_seed_util.h
│       │   ├── randperm_kernel.cpp
│       │   ├── randperm_kernel.cu
│       │   ├── raw_reader_kernel.cpp
│       │   ├── reduce_kernel.cpp
│       │   ├── reduce_like_kernels.cpp
│       │   ├── reflection_pad_kernels.cpp
│       │   ├── reflection_pad_kernels_util.cpp
│       │   ├── reflection_pad_kernels_util.cu
│       │   ├── reflection_pad_kernels_util.h
│       │   ├── repeat_interleave_kernel.cpp
│       │   ├── repeat_interleave_kernel.cu
│       │   ├── replication_pad_kernels.cpp
│       │   ├── replication_pad_kernels_util.cpp
│       │   ├── replication_pad_kernels_util.cu
│       │   ├── replication_pad_kernels_util.h
│       │   ├── rms_norm_gpu_kernel.cu
│       │   ├── roc_auc_score_kernel.cpp
│       │   ├── roi_align_kernel.cu
│       │   ├── roll_kernel.cpp
│       │   ├── roll_kernel.cu
│       │   ├── roll_kernel_utils.h
│       │   ├── rrelu_kernel.cpp
│       │   ├── rrelu_kernel.cu
│       │   ├── same_padding_kernel.cpp
│       │   ├── scalar_bitwise_kernels.cpp
│       │   ├── scalar_by_tensor_kernel.cpp
│       │   ├── scalar_logical_kernels.cpp
│       │   ├── scalar_math_kernels.cpp
│       │   ├── scaled_dot_product_attention_grad_kernel.cu
│       │   ├── scaled_dot_product_attention_kernel.cu
│       │   ├── scaled_dot_product_attention_kernel.h
│       │   ├── scaled_dot_product_attention_util.h
│       │   ├── search_sorted_kernel.cpp
│       │   ├── search_sorted_kernel.cu
│       │   ├── search_sorted_kernel_util.h
│       │   ├── sigmoid_cross_entropy_kernel.cpp
│       │   ├── sigmoid_cross_entropy_kernel.cu
│       │   ├── sigmoid_cross_entropy_kernel.h
│       │   ├── skip_layer_norm_kernel.cu
│       │   ├── skip_rms_norm_kernel.cu
│       │   ├── slice_kernel.cpp
│       │   ├── slice_util.cpp
│       │   ├── slice_util.cu
│       │   ├── slice_util.h
│       │   ├── smooth_l1_loss_kernel.cpp
│       │   ├── smooth_l1_loss_kernel.cu
│       │   ├── softmax_cross_entropy_kernel.cpp
│       │   ├── softmax_cross_entropy_kernel.cu
│       │   ├── softmax_cross_entropy_kernel.h
│       │   ├── softmax_kernel.cpp
│       │   ├── sort_kernel.cpp
│       │   ├── sort_kernel.cu
│       │   ├── sparse_cross_entropy_kernel.cpp
│       │   ├── sparse_cross_entropy_kernel_util.cpp
│       │   ├── sparse_cross_entropy_kernel_util.cu
│       │   ├── sparse_cross_entropy_kernel_util.h
│       │   ├── sparse_softmax_cross_entropy_kernel.cpp
│       │   ├── sparse_softmax_cross_entropy_kernel.cu
│       │   ├── sparse_softmax_cross_entropy_kernel_util.cpp
│       │   ├── sparse_softmax_cross_entropy_kernel_util.cu
│       │   ├── sparse_softmax_cross_entropy_kernel_util.h
│       │   ├── split_like_kernel.cpp
│       │   ├── sqrt_square_sum_kernel.cpp
│       │   ├── sqrt_square_sum_kernel_util.cpp
│       │   ├── sqrt_square_sum_kernel_util.cu
│       │   ├── sqrt_square_sum_kernel_util.h
│       │   ├── square_sum_kernel.cpp
│       │   ├── square_sum_kernel_util.cpp
│       │   ├── square_sum_kernel_util.cu
│       │   ├── square_sum_kernel_util.h
│       │   ├── ssp_variable_proxy_kernel.cpp
│       │   ├── stack_kernel.cpp
│       │   ├── stateful_opkernel.cpp
│       │   ├── stateful_opkernel.h
│       │   ├── summary_kernels.cpp
│       │   ├── tensor_buffer_kernels.cpp
│       │   ├── tensor_constant_kernel.cpp
│       │   ├── tf_pool_cpu_kernel.cpp
│       │   ├── tf_pool_gpu_kernel.cpp
│       │   ├── tf_prelu_kernel.cpp
│       │   ├── tf_prelu_kernel.cu
│       │   ├── throw_error_kernel.cpp
│       │   ├── to_contiguous_kernel.cpp
│       │   ├── to_contiguous_kernel.cu
│       │   ├── to_contiguous_kernel.h
│       │   ├── top_k_kernel.cpp
│       │   ├── top_k_kernel.cu
│       │   ├── transpose_kernel.cpp
│       │   ├── tril_kernel.cpp
│       │   ├── tril_kernel.cu
│       │   ├── triu_kernel.cpp
│       │   ├── triu_kernel.cu
│       │   ├── tuple_identity_kernel.cpp
│       │   ├── two_stage_reduce_kernel.cpp
│       │   ├── two_stage_reduce_kernel_util.cpp
│       │   ├── two_stage_reduce_kernel_util.cu
│       │   ├── two_stage_reduce_kernel_util.h
│       │   ├── unfold_kernel.cpp
│       │   ├── unfold_kernel_util.cpp
│       │   ├── unfold_kernel_util.cu
│       │   ├── unfold_kernel_util.h
│       │   ├── unfold_tensor_kernel.cpp
│       │   ├── unfold_tensor_kernel.cu
│       │   ├── unfold_tensor_kernel_utils.h
│       │   ├── unique_kernel.cpp
│       │   ├── unique_kernel_util.cpp
│       │   ├── unique_kernel_util.cu
│       │   ├── unique_kernel_util.h
│       │   ├── unique_with_counts_kernel.cpp
│       │   ├── unpack_kernel.cpp
│       │   ├── unsorted_batch_segment_sum_kernel.cpp
│       │   ├── unsorted_segment_sum_kernel.cpp
│       │   ├── unsorted_segment_sum_kernel_util.cpp
│       │   ├── unsorted_segment_sum_kernel_util.cu
│       │   ├── unsorted_segment_sum_kernel_util.h
│       │   ├── upsample_bicubic_2d_kernel.cpp
│       │   ├── upsample_bicubic_2d_kernel.cu
│       │   ├── upsample_bilinear_2d_kernel.cpp
│       │   ├── upsample_bilinear_2d_kernel.cu
│       │   ├── upsample_kernel.h
│       │   ├── upsample_linear_1d_kernel.cpp
│       │   ├── upsample_linear_1d_kernel.cu
│       │   ├── upsample_nearest_kernel.cpp
│       │   ├── upsample_nearest_kernel.cu
│       │   ├── upsample_trilinear_3d_kernel.cpp
│       │   ├── upsample_trilinear_3d_kernel.cu
│       │   ├── util_ops_kernels.cpp
│       │   ├── variance_kernel.cpp
│       │   ├── variance_kernel_util.cpp
│       │   ├── variance_kernel_util.cu
│       │   ├── variance_kernel_util.h
│       │   ├── vector_matrix_product_kernel.cpp
│       │   ├── where_kernel.cpp
│       │   ├── where_kernel_util.cpp
│       │   ├── where_kernel_util.cu
│       │   ├── where_kernel_util.h
│       │   └── zero_like_kernel.cpp
│       ├── ops/
│       │   ├── acc_ctrl_tick_op.cpp
│       │   ├── acc_op.cpp
│       │   ├── adaptive_max_pool_op.cpp
│       │   ├── adaptive_pool_op.cpp
│       │   ├── add_n_op.cpp
│       │   ├── affine_grid_op.cpp
│       │   ├── amp_white_identity_op.cpp
│       │   ├── arange_op.cpp
│       │   ├── arg_sort_op.cpp
│       │   ├── arg_where_op.cpp
│       │   ├── argmax_op.cpp
│       │   ├── as_strided_op.cpp
│       │   ├── assign_op.cpp
│       │   ├── avg_pool_op.cpp
│       │   ├── batch_gather_op.cpp
│       │   ├── batch_norm_backward_elemt_op.cpp
│       │   ├── batch_norm_backward_reduce_op.cpp
│       │   ├── batch_norm_elemt_op.cpp
│       │   ├── batch_norm_gather_stats_with_counts_op.cpp
│       │   ├── batch_norm_stats_op.cpp
│       │   ├── bernoulli_op.cpp
│       │   ├── bias_add_op.cpp
│       │   ├── binary_cross_entropy_op.cpp
│       │   ├── binary_cross_entropy_with_logits_op.cpp
│       │   ├── binary_cross_entropy_with_logits_reduce_mean_op.cpp
│       │   ├── bincount_op.cpp
│       │   ├── broadcast_div_grad_op.cpp
│       │   ├── broadcast_like_op.cpp
│       │   ├── buffer_op.cpp
│       │   ├── cast_like_op.cpp
│       │   ├── cast_op.cpp
│       │   ├── cast_to_static_shape_op.cpp
│       │   ├── cast_to_tick_op.cpp
│       │   ├── categorical_ordinal_encode_op.cpp
│       │   ├── celu_op.cpp
│       │   ├── clip_by_value_op.cpp
│       │   ├── coco_reader_op.cpp
│       │   ├── combined_margin_loss_op.cpp
│       │   ├── comm_net_device_infer_util.cpp
│       │   ├── comm_net_device_infer_util.h
│       │   ├── complex_ops.cpp
│       │   ├── concat_op.cpp
│       │   ├── constant_op.cpp
│       │   ├── conv_op.cpp
│       │   ├── convert_memory_format_op.cpp
│       │   ├── convert_memory_format_op.h
│       │   ├── copy_hd_op.cpp
│       │   ├── copy_op.cpp
│       │   ├── count_not_finite_op.cpp
│       │   ├── ctc_loss_op.cpp
│       │   ├── cublas_bias_add_relu_matmul_grad_op.cpp
│       │   ├── cublas_fused_matmul_bias_add_grad_op.cpp
│       │   ├── cublas_fused_mlp_grad_op.cpp
│       │   ├── cublas_fused_mlp_op.cpp
│       │   ├── cum_ops.cpp
│       │   ├── data_shuffle_op.cpp
│       │   ├── deconv_op.cpp
│       │   ├── deform_conv_op.cpp
│       │   ├── depend_op.cpp
│       │   ├── det_op.cpp
│       │   ├── diag_op.cpp
│       │   ├── diagonal_op.cpp
│       │   ├── dim_gather_op.cpp
│       │   ├── dim_scatter_ops.cpp
│       │   ├── distributions/
│       │   │   ├── exponential_op.cpp
│       │   │   ├── multinomial_with_replacement_op.cpp
│       │   │   ├── normal_op.cpp
│       │   │   ├── uniform_int_op.cpp
│       │   │   └── uniform_op.cpp
│       │   ├── dot_op.cpp
│       │   ├── dropout_op.cpp
│       │   ├── dynamic_loss_scale_schedule_op.cpp
│       │   ├── eager_b_to_s_op.cpp
│       │   ├── eager_ccl_ops.cpp
│       │   ├── eager_p_to_b_op.cpp
│       │   ├── eager_p_to_s_op.cpp
│       │   ├── eager_s_to_b_op.cpp
│       │   ├── eager_s_to_p_op.cpp
│       │   ├── eager_s_to_s_op.cpp
│       │   ├── eager_symmetric_s_to_p_op.cpp
│       │   ├── elementwise_maximum_minimum_ops.cpp
│       │   ├── elu_op.cpp
│       │   ├── embedding_op.cpp
│       │   ├── empty_op.cpp
│       │   ├── erfinv_op.cpp
│       │   ├── expand_dims_op.cpp
│       │   ├── expand_op.cpp
│       │   ├── eye_op.cpp
│       │   ├── fake_quantization_op.cpp
│       │   ├── fft_ops.cpp
│       │   ├── fill_op.cpp
│       │   ├── flip_op.cpp
│       │   ├── frac_op.cpp
│       │   ├── fused_attention_ops.cpp
│       │   ├── fused_bias_add_op.cpp
│       │   ├── fused_bias_add_scale_mask_softmax_dropout_op.cpp
│       │   ├── fused_cast_scale_op.cpp
│       │   ├── fused_center_op.cpp
│       │   ├── fused_clip_grad_ops.cpp
│       │   ├── fused_codegeex_qkv_reshape.cpp
│       │   ├── fused_cross_feature_interaction_op.cpp
│       │   ├── fused_dot_feature_interaction_op.cpp
│       │   ├── fused_get_boundding_boxes_coord_op.cpp
│       │   ├── fused_get_ciou_diagonal_angle_op.cpp
│       │   ├── fused_get_ciou_result_op.cpp
│       │   ├── fused_get_convex_diagonal_squared_op.cpp
│       │   ├── fused_get_intersection_area_op.cpp
│       │   ├── fused_get_iou_op.cpp
│       │   ├── fused_glu_op.cpp
│       │   ├── fused_glu_without_linear_grad_op.cpp
│       │   ├── fused_gru_cell_op.cpp
│       │   ├── fused_linear_with_groupwise_quantized_weight_op.cpp
│       │   ├── fused_lstm_cell_op.cpp
│       │   ├── fused_matmul_bias_add_relu_dropout_op.cpp
│       │   ├── fused_matmul_bias_op.cpp
│       │   ├── fused_relu_dropout_grad_op.cpp
│       │   ├── fused_scale_mask_bias_softmax_op.cpp
│       │   ├── fused_scale_mask_softmax_dropout_op.cpp
│       │   ├── fused_scale_mask_softmax_op.cpp
│       │   ├── fused_scale_tril_softmax_mask_scale_op.cpp
│       │   ├── fused_self_attention_query_mul_key_and_value_ops.cpp
│       │   ├── fused_weighted_sum_op.cpp
│       │   ├── gather_op.cpp
│       │   ├── gelu_op.cpp
│       │   ├── generate_random_batch_permutation_indices_op.cpp
│       │   ├── gpt_data_loader_op.cpp
│       │   ├── greater_inplace_op.cpp
│       │   ├── grid_sample_op.cpp
│       │   ├── group_norm_op.cpp
│       │   ├── grouped_matmul_bias_op.cpp
│       │   ├── groupwise_dequantize_op.cpp
│       │   ├── hardshrink_op.cpp
│       │   ├── hardsigmoid_op.cpp
│       │   ├── hardswish_op.cpp
│       │   ├── hardtanh_op.cpp
│       │   ├── hierarchical_parallel_cast_op.cpp
│       │   ├── identity_op.cpp
│       │   ├── image_batch_align_op.cpp
│       │   ├── image_decode_op.cpp
│       │   ├── image_object_preprocess_ops.cpp
│       │   ├── image_preprocess_ops.cpp
│       │   ├── image_resize_ops.cpp
│       │   ├── image_target_resize_op.cpp
│       │   ├── in_top_k_op.cpp
│       │   ├── index_add_op.cpp
│       │   ├── indexed_slices_reduce_sum_op.cpp
│       │   ├── inv_op.cpp
│       │   ├── kl_div_op.cpp
│       │   ├── l1_l2_regularize_gradient_op.cpp
│       │   ├── l2_normalize_op.cpp
│       │   ├── layer_norm_op.cpp
│       │   ├── leaky_relu_op.cpp
│       │   ├── lerp_op.cpp
│       │   ├── linalg_cross_op.cpp
│       │   ├── log_softmax_op.cpp
│       │   ├── logical_not_op.cpp
│       │   ├── loss_op_util.cpp
│       │   ├── loss_op_util.h
│       │   ├── lu_composition_op.cpp
│       │   ├── masked_fill_op.cpp
│       │   ├── math_binary_broadcast_ops.cpp
│       │   ├── math_binary_broadcast_seq.h
│       │   ├── math_binary_elementwise_ops.cpp
│       │   ├── math_binary_elementwise_seq.h
│       │   ├── math_unary_elementwise_op.cpp
│       │   ├── math_unary_elementwise_seq.h
│       │   ├── matmul_op.cpp
│       │   ├── matrix_vector_product_op.cpp
│       │   ├── max_pool_op.cpp
│       │   ├── max_unpool_op.cpp
│       │   ├── median_op.cpp
│       │   ├── median_with_indices_op.cpp
│       │   ├── min_max_observer_op.cpp
│       │   ├── mish_op.cpp
│       │   ├── mode_op.cpp
│       │   ├── model_update_ops.cpp
│       │   ├── moving_average_min_max_observer_op.cpp
│       │   ├── multi_reduce_ops.cpp
│       │   ├── multi_tensor_model_update_ops.cpp
│       │   ├── mutable_cast_once_op.cpp
│       │   ├── narrow_op.cpp
│       │   ├── nccl_logical_2d_sbp_ops.cpp
│       │   ├── nccl_logical_fusion_op.cpp
│       │   ├── nccl_logical_ops.cpp
│       │   ├── nccl_logical_util.cpp
│       │   ├── nccl_logical_util.h
│       │   ├── nd_index_slice_ops.cpp
│       │   ├── nll_op.cpp
│       │   ├── nms_op.cpp
│       │   ├── nn_util.cpp
│       │   ├── nn_util.h
│       │   ├── noncontiguous_binary_op.cpp
│       │   ├── normalization_op.cpp
│       │   ├── nvtx_range_op.cpp
│       │   ├── ofrecord_decoder_ops.cpp
│       │   ├── ofrecord_image_classification_reader_op.cpp
│       │   ├── ofrecord_reader_op.cpp
│       │   ├── one_embedding_ops.cpp
│       │   ├── one_hot_op.cpp
│       │   ├── ones_like_op.cpp
│       │   ├── p2p_comm_op.cpp
│       │   ├── pack_op.cpp
│       │   ├── pad_op.cpp
│       │   ├── parallel_cast_op.cpp
│       │   ├── partial_fc_sample_op.cpp
│       │   ├── pinned_identity_op.cpp
│       │   ├── prelu_op.cpp
│       │   ├── quantization_op.cpp
│       │   ├── quick_gelu_op.cpp
│       │   ├── randperm_op.cpp
│       │   ├── raw_reader_op.cpp
│       │   ├── reduce_like_ops.cpp
│       │   ├── reduce_ops.cpp
│       │   ├── reflection_pad_op.cpp
│       │   ├── relu_op.cpp
│       │   ├── repeat_interleave_op.cpp
│       │   ├── repeat_op.cpp
│       │   ├── replication_pad_op.cpp
│       │   ├── reshape_like_op.cpp
│       │   ├── reshape_op.cpp
│       │   ├── reshape_user_op_util.cpp
│       │   ├── reshape_user_op_util.h
│       │   ├── reshape_user_op_util_test.cpp
│       │   ├── rms_norm_op.cpp
│       │   ├── roc_auc_score_op.cpp
│       │   ├── roi_align_op.cpp
│       │   ├── roll_op.cpp
│       │   ├── rrelu_op.cpp
│       │   ├── same_padding_op.cpp
│       │   ├── scalar_bitwise_op.cpp
│       │   ├── scalar_by_tensor_op.cpp
│       │   ├── scalar_logical_op.cpp
│       │   ├── scalar_math_op.cpp
│       │   ├── scaled_dot_product_flash_attention_op.cpp
│       │   ├── search_sorted_op.cpp
│       │   ├── selu_op.cpp
│       │   ├── sigmoid_cross_entropy_op.cpp
│       │   ├── silu_op.cpp
│       │   ├── skip_layer_norm_op.cpp
│       │   ├── skip_rms_norm_op.cpp
│       │   ├── slice_op.cpp
│       │   ├── smooth_l1_loss_op.cpp
│       │   ├── softmax_cross_entropy_op.cpp
│       │   ├── softmax_op.cpp
│       │   ├── softplus_op.cpp
│       │   ├── softshrink_op.cpp
│       │   ├── softsign_op.cpp
│       │   ├── sort_op.cpp
│       │   ├── sparse_cross_entropy_op.cpp
│       │   ├── sparse_softmax_cross_entropy_op.cpp
│       │   ├── split_like_op.cpp
│       │   ├── sqrt_square_sum_op.cpp
│       │   ├── square_relu_op.cpp
│       │   ├── square_sum_op.cpp
│       │   ├── squeeze_op.cpp
│       │   ├── ssp_variable_proxy_op.cpp
│       │   ├── stack_op.cpp
│       │   ├── stft_op.cpp
│       │   ├── summary_ops.cpp
│       │   ├── tanh_op.cpp
│       │   ├── tensor_buffer_ops.cpp
│       │   ├── tensor_constant_op.cpp
│       │   ├── tf_pool_op.cpp
│       │   ├── tf_prelu_op.cpp
│       │   ├── threshold_op.cpp
│       │   ├── throw_error_op.cpp
│       │   ├── to_contiguous_op.cpp
│       │   ├── top_k_op.cpp
│       │   ├── transpose_ops.cpp
│       │   ├── tril_op.cpp
│       │   ├── triu_op.cpp
│       │   ├── trunc_op.cpp
│       │   ├── tuple_identity_op.cpp
│       │   ├── two_stage_reduce_ops.cpp
│       │   ├── unfold_fold_op.cpp
│       │   ├── unfold_tensor_op.cpp
│       │   ├── unique_op.cpp
│       │   ├── unique_with_counts_op.cpp
│       │   ├── unpack_op.cpp
│       │   ├── unsorted_batch_segment_sum_op.cpp
│       │   ├── unsorted_segment_sum_op.cpp
│       │   ├── upsample_op.cpp
│       │   ├── util_ops.cpp
│       │   ├── variance_op.cpp
│       │   ├── vector_matrix_product_op.cpp
│       │   ├── where_op.cpp
│       │   └── zero_like_op.cpp
│       ├── summary/
│       │   ├── crc32c.h
│       │   ├── env_time.h
│       │   ├── event_writer_helper.cpp
│       │   ├── event_writer_helper.h
│       │   ├── events_writer.cpp
│       │   ├── events_writer.h
│       │   ├── histogram.cpp
│       │   ├── histogram.h
│       │   ├── plan_to_physical_graph.cpp
│       │   ├── plan_to_physical_graph.h
│       │   └── summary_converter.h
│       └── utils/
│           ├── pool_util.cpp
│           └── pool_util.h
├── python/
│   ├── .gitignore
│   ├── oneflow/
│   │   ├── _C/
│   │   │   ├── __init__.py
│   │   │   └── _nn.py
│   │   ├── __init__.py
│   │   ├── __main__.py
│   │   ├── _dynamo/
│   │   │   └── __init__.py
│   │   ├── _utils.py
│   │   ├── amp/
│   │   │   ├── __init__.py
│   │   │   ├── autocast_mode.py
│   │   │   └── grad_scaler.py
│   │   ├── ao/
│   │   │   └── quantization.py
│   │   ├── asyncs/
│   │   │   ├── __init__.py
│   │   │   └── thread.py
│   │   ├── autograd/
│   │   │   ├── __init__.py
│   │   │   ├── autograd.py
│   │   │   ├── autograd_function.py
│   │   │   ├── autograd_mode.py
│   │   │   ├── functional.py
│   │   │   ├── graph.py
│   │   │   └── profiler.py
│   │   ├── autoprof/
│   │   │   ├── __init__.py
│   │   │   ├── __main__.py
│   │   │   └── util.py
│   │   ├── backends/
│   │   │   ├── __init__.py
│   │   │   ├── cuda/
│   │   │   │   └── __init__.py
│   │   │   ├── cudnn/
│   │   │   │   └── __init__.py
│   │   │   └── mps/
│   │   │       └── __init__.py
│   │   ├── boxing/
│   │   │   ├── __init__.py
│   │   │   └── nccl/
│   │   │       └── __init__.py
│   │   ├── comm/
│   │   │   ├── __init__.py
│   │   │   └── comm_ops.py
│   │   ├── cuda/
│   │   │   ├── __init__.py
│   │   │   ├── _utils.py
│   │   │   ├── amp/
│   │   │   │   ├── __init__.py
│   │   │   │   └── autocast_mode.py
│   │   │   ├── random.py
│   │   │   └── type_tensor.py
│   │   ├── data.py
│   │   ├── distributed/
│   │   │   ├── __init__.py
│   │   │   ├── constants.py
│   │   │   └── launch.py
│   │   ├── distributions/
│   │   │   ├── __init__.py
│   │   │   ├── categorical.py
│   │   │   ├── distribution.py
│   │   │   └── utils.py
│   │   ├── env.py
│   │   ├── experimental/
│   │   │   └── load_mnist.py
│   │   ├── fft/
│   │   │   └── __init__.py
│   │   ├── framework/
│   │   │   ├── __init__.py
│   │   │   ├── args_tree.py
│   │   │   ├── attr_util.py
│   │   │   ├── balanced_splitter.py
│   │   │   ├── c_api_util.py
│   │   │   ├── check_point_v2.py
│   │   │   ├── config_util.py
│   │   │   ├── distribute.py
│   │   │   ├── docstr/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── activation.py
│   │   │   │   ├── addcdiv.py
│   │   │   │   ├── amax.py
│   │   │   │   ├── amin.py
│   │   │   │   ├── arange.py
│   │   │   │   ├── argsort.py
│   │   │   │   ├── array_ops.py
│   │   │   │   ├── as_tensor.py
│   │   │   │   ├── autograd.py
│   │   │   │   ├── baddbmm.py
│   │   │   │   ├── bitwise_ops.py
│   │   │   │   ├── bmm.py
│   │   │   │   ├── broadcast_like.py
│   │   │   │   ├── cast.py
│   │   │   │   ├── chunk.py
│   │   │   │   ├── clamp.py
│   │   │   │   ├── comm.py
│   │   │   │   ├── comparison.py
│   │   │   │   ├── constant.py
│   │   │   │   ├── conv.py
│   │   │   │   ├── convolution.py
│   │   │   │   ├── ctc_decode.py
│   │   │   │   ├── dataset.py
│   │   │   │   ├── deconv.py
│   │   │   │   ├── depend.py
│   │   │   │   ├── distance.py
│   │   │   │   ├── dropout.py
│   │   │   │   ├── einsum.py
│   │   │   │   ├── erfinv.py
│   │   │   │   ├── expand.py
│   │   │   │   ├── flatten.py
│   │   │   │   ├── flip.py
│   │   │   │   ├── hann_window.py
│   │   │   │   ├── in_top_k.py
│   │   │   │   ├── index_add.py
│   │   │   │   ├── index_select.py
│   │   │   │   ├── inv.py
│   │   │   │   ├── is_floating_point.py
│   │   │   │   ├── lerp.py
│   │   │   │   ├── linalg.py
│   │   │   │   ├── logaddexp.py
│   │   │   │   ├── logical_ops.py
│   │   │   │   ├── loss.py
│   │   │   │   ├── masked_fill.py
│   │   │   │   ├── math_ops.py
│   │   │   │   ├── meshgrid.py
│   │   │   │   ├── module.py
│   │   │   │   ├── nms.py
│   │   │   │   ├── nonzero.py
│   │   │   │   ├── norm.py
│   │   │   │   ├── normalization.py
│   │   │   │   ├── oneflow.py
│   │   │   │   ├── onehot.py
│   │   │   │   ├── pooling.py
│   │   │   │   ├── quantile.py
│   │   │   │   ├── random.py
│   │   │   │   ├── reduce_ops.py
│   │   │   │   ├── repeat.py
│   │   │   │   ├── repeat_interleave.py
│   │   │   │   ├── roc_auc_score.py
│   │   │   │   ├── searchsorted.py
│   │   │   │   ├── sort.py
│   │   │   │   ├── special_ops.py
│   │   │   │   ├── split.py
│   │   │   │   ├── swapaxes.py
│   │   │   │   ├── swapdims.py
│   │   │   │   ├── tensor.py
│   │   │   │   ├── tensor_attributes.py
│   │   │   │   ├── tensor_ops.py
│   │   │   │   ├── tensor_t.py
│   │   │   │   ├── tensordot.py
│   │   │   │   ├── tile.py
│   │   │   │   ├── topk.py
│   │   │   │   ├── trigonometric_ops.py
│   │   │   │   ├── unbind.py
│   │   │   │   ├── util_ops.py
│   │   │   │   ├── utils.py
│   │   │   │   ├── vision.py
│   │   │   │   └── where.py
│   │   │   ├── dtype.py
│   │   │   ├── env_util.py
│   │   │   ├── function_desc.py
│   │   │   ├── function_util.py
│   │   │   ├── generator.py
│   │   │   ├── graph_build_util.py
│   │   │   ├── hob.py
│   │   │   ├── id_util.py
│   │   │   ├── infer_compiler/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── import_tools/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── format_utils.py
│   │   │   │   │   └── importer.py
│   │   │   │   ├── transform/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── builtin_transform.py
│   │   │   │   │   ├── custom_transform.py
│   │   │   │   │   └── manager.py
│   │   │   │   ├── utils/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── args_tree_util.py
│   │   │   │   │   ├── cost_util.py
│   │   │   │   │   ├── log_utils.py
│   │   │   │   │   ├── oneflow_exec_mode.py
│   │   │   │   │   ├── param_utils.py
│   │   │   │   │   ├── patch_for_compiler.py
│   │   │   │   │   └── patch_for_diffusers.py
│   │   │   │   ├── with_fx_graph.py
│   │   │   │   ├── with_fx_interpreter.py
│   │   │   │   ├── with_oneflow_backend.py
│   │   │   │   └── with_oneflow_compile.py
│   │   │   ├── job_set_util.py
│   │   │   ├── model.py
│   │   │   ├── multi_client_session.py
│   │   │   ├── register_class_method_util.py
│   │   │   ├── scope_util.py
│   │   │   ├── session_context.py
│   │   │   ├── sysconfig.py
│   │   │   ├── tensor.py
│   │   │   ├── tensor_str.py
│   │   │   ├── tensor_str_util.py
│   │   │   ├── tensor_tuple_util.py
│   │   │   ├── type_tensor.py
│   │   │   └── unittest.py
│   │   ├── fx/
│   │   │   └── __init__.py
│   │   ├── hub.py
│   │   ├── ir/
│   │   │   ├── __main__.py
│   │   │   ├── ast_gen_transformer.py
│   │   │   ├── bisect_transformer.py
│   │   │   ├── lr_jit.py
│   │   │   ├── math_params_transformer.py
│   │   │   └── self_params_transformer.py
│   │   ├── jit/
│   │   │   ├── __init__.py
│   │   │   └── annotations.py
│   │   ├── library.py
│   │   ├── linalg.py
│   │   ├── mock_torch/
│   │   │   ├── __init__.py
│   │   │   ├── __main__.py
│   │   │   ├── dyn_mock_mod.py
│   │   │   ├── mock_importer.py
│   │   │   ├── mock_modules.py
│   │   │   ├── mock_utils.py
│   │   │   └── torch/
│   │   │       └── __init__.py
│   │   ├── model.py
│   │   ├── multiprocessing/
│   │   │   ├── __init__.py
│   │   │   ├── _atfork.py
│   │   │   ├── pool.py
│   │   │   ├── queue.py
│   │   │   ├── reductions.py
│   │   │   ├── shared_memory/
│   │   │   │   └── __init__.py
│   │   │   └── spawn.py
│   │   ├── nn/
│   │   │   ├── __init__.py
│   │   │   ├── common_types.py
│   │   │   ├── functional/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── batch_norm.py
│   │   │   │   ├── ctc_loss.py
│   │   │   │   ├── deform_conv.py
│   │   │   │   ├── depend.py
│   │   │   │   ├── maxpool.py
│   │   │   │   ├── pad.py
│   │   │   │   └── softmax.py
│   │   │   ├── graph/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── cache.py
│   │   │   │   ├── graph.py
│   │   │   │   ├── graph_block.py
│   │   │   │   ├── graph_config.py
│   │   │   │   ├── optimizer.py
│   │   │   │   ├── proxy.py
│   │   │   │   └── util.py
│   │   │   ├── image.py
│   │   │   ├── init.py
│   │   │   ├── modules/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── _functions.py
│   │   │   │   ├── activation.py
│   │   │   │   ├── affine_grid.py
│   │   │   │   ├── all_reduce.py
│   │   │   │   ├── arange.py
│   │   │   │   ├── argsort.py
│   │   │   │   ├── argwhere.py
│   │   │   │   ├── as_tensor.py
│   │   │   │   ├── batchnorm.py
│   │   │   │   ├── batchnorm_fused.py
│   │   │   │   ├── broadcast_ops.py
│   │   │   │   ├── constant.py
│   │   │   │   ├── container.py
│   │   │   │   ├── conv.py
│   │   │   │   ├── dataset.py
│   │   │   │   ├── distance.py
│   │   │   │   ├── distributed_partial_fc_sample.py
│   │   │   │   ├── dropout.py
│   │   │   │   ├── einsum.py
│   │   │   │   ├── empty.py
│   │   │   │   ├── expand.py
│   │   │   │   ├── fake_quantization.py
│   │   │   │   ├── flatten.py
│   │   │   │   ├── fold.py
│   │   │   │   ├── fused_mlp.py
│   │   │   │   ├── global_cast.py
│   │   │   │   ├── grid_sample.py
│   │   │   │   ├── instancenorm.py
│   │   │   │   ├── interpolate.py
│   │   │   │   ├── is_tensor.py
│   │   │   │   ├── linear.py
│   │   │   │   ├── linspace.py
│   │   │   │   ├── logspace.py
│   │   │   │   ├── loss.py
│   │   │   │   ├── masked_select.py
│   │   │   │   ├── math_ops.py
│   │   │   │   ├── meshgrid.py
│   │   │   │   ├── min_max_observer.py
│   │   │   │   ├── module.py
│   │   │   │   ├── moving_average_min_max_observer.py
│   │   │   │   ├── nms.py
│   │   │   │   ├── nonzero.py
│   │   │   │   ├── norm.py
│   │   │   │   ├── normalization.py
│   │   │   │   ├── numel.py
│   │   │   │   ├── padding.py
│   │   │   │   ├── pixelshuffle.py
│   │   │   │   ├── pooling.py
│   │   │   │   ├── quantization.py
│   │   │   │   ├── reshape.py
│   │   │   │   ├── rnn.py
│   │   │   │   ├── roll.py
│   │   │   │   ├── scatter.py
│   │   │   │   ├── slice.py
│   │   │   │   ├── sparse.py
│   │   │   │   ├── sparse_softmax_cross_entropy.py
│   │   │   │   ├── tensor_buffer.py
│   │   │   │   ├── tensordot.py
│   │   │   │   ├── trigonometric_ops.py
│   │   │   │   ├── unique.py
│   │   │   │   ├── upsampling.py
│   │   │   │   ├── utils.py
│   │   │   │   └── where.py
│   │   │   ├── optimizer/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── adadelta.py
│   │   │   │   ├── adagrad.py
│   │   │   │   ├── adam.py
│   │   │   │   ├── adamw.py
│   │   │   │   ├── chained_scheduler.py
│   │   │   │   ├── constant_lr.py
│   │   │   │   ├── cosine_annealing_lr.py
│   │   │   │   ├── cosine_annealing_warm_restarts.py
│   │   │   │   ├── cosine_decay_lr.py
│   │   │   │   ├── exponential_lr.py
│   │   │   │   ├── lamb.py
│   │   │   │   ├── lambda_lr.py
│   │   │   │   ├── lbfgs.py
│   │   │   │   ├── linear_lr.py
│   │   │   │   ├── lr_scheduler.py
│   │   │   │   ├── multiplicative_lr.py
│   │   │   │   ├── multistep_lr.py
│   │   │   │   ├── polynomial_lr.py
│   │   │   │   ├── reduce_lr_on_plateau.py
│   │   │   │   ├── rmsprop.py
│   │   │   │   ├── sequential_lr.py
│   │   │   │   ├── sgd.py
│   │   │   │   ├── step_lr.py
│   │   │   │   ├── swa_utils.py
│   │   │   │   └── warmup_lr.py
│   │   │   ├── parallel/
│   │   │   │   ├── __init__.py
│   │   │   │   └── distributed.py
│   │   │   ├── parameter.py
│   │   │   ├── qat/
│   │   │   │   ├── __init__.py
│   │   │   │   └── conv.py
│   │   │   └── utils/
│   │   │       ├── __init__.py
│   │   │       ├── clip_grad.py
│   │   │       ├── container.py
│   │   │       ├── convert_parameters.py
│   │   │       ├── parameters_grouping.py
│   │   │       ├── prune.py
│   │   │       ├── rnn.py
│   │   │       ├── skip_init.py
│   │   │       └── weight_norm.py
│   │   ├── one_embedding.py
│   │   ├── onnx/
│   │   │   ├── __init__.py
│   │   │   └── symbolic_helper.py
│   │   ├── ops/
│   │   │   ├── __init__.py
│   │   │   ├── array_ops.py
│   │   │   ├── stateful_ops.py
│   │   │   ├── transpose_util.py
│   │   │   └── util/
│   │   │       ├── __init__.py
│   │   │       └── initializer_util.py
│   │   ├── optim/
│   │   │   ├── __init__.py
│   │   │   ├── lr_scheduler.py
│   │   │   ├── optimizer.py
│   │   │   └── swa_utils.py
│   │   ├── profiler/
│   │   │   ├── __init__.py
│   │   │   ├── events.py
│   │   │   ├── profiler.py
│   │   │   └── util.py
│   │   ├── remat/
│   │   │   └── __init__.py
│   │   ├── sbp.py
│   │   ├── special/
│   │   │   ├── __init__.py
│   │   │   └── special_ops.py
│   │   ├── support/
│   │   │   ├── __init__.py
│   │   │   ├── async_util.py
│   │   │   ├── box.py
│   │   │   ├── enable_if.py
│   │   │   ├── env_var_util.py
│   │   │   ├── func_inspect_util.py
│   │   │   ├── high_order_bool.py
│   │   │   ├── lazy.py
│   │   │   ├── pb_util.py
│   │   │   ├── scope_stack.py
│   │   │   └── traceinfo.py
│   │   ├── sysconfig.py
│   │   ├── test/
│   │   │   ├── README.md
│   │   │   ├── dataloader/
│   │   │   │   ├── data_utils.py
│   │   │   │   ├── test_cifar_dataset_multiprocess.py
│   │   │   │   ├── test_cifar_dataset_singleprocess.py
│   │   │   │   ├── test_fashion_mnist_dataset.py
│   │   │   │   ├── test_lenet.py
│   │   │   │   ├── test_mnist_dataset.py
│   │   │   │   ├── test_numpy_dataset.py
│   │   │   │   ├── test_tensor_dataset.py
│   │   │   │   └── test_transforms.py
│   │   │   ├── exceptions/
│   │   │   │   ├── test_activation.py
│   │   │   │   ├── test_add_n_op.py
│   │   │   │   ├── test_arg_sort_op.py
│   │   │   │   ├── test_array_functor.py
│   │   │   │   ├── test_autograd.py
│   │   │   │   ├── test_batch_gather_op.py
│   │   │   │   ├── test_bias_add_op.py
│   │   │   │   ├── test_binary_functor_exception.py
│   │   │   │   ├── test_bmm.py
│   │   │   │   ├── test_broadcast_ops.py
│   │   │   │   ├── test_chunk.py
│   │   │   │   ├── test_cosine_similarity.py
│   │   │   │   ├── test_deform_conv2d_op.py
│   │   │   │   ├── test_device.py
│   │   │   │   ├── test_dot.py
│   │   │   │   ├── test_error_reported_in_thread.py
│   │   │   │   ├── test_gird_sample_op.py
│   │   │   │   ├── test_global_branch_error_local_to_global_with_broadcast_sbp_1n2d.py
│   │   │   │   ├── test_global_branch_error_local_to_global_with_broadcast_sbp_1n4d.py
│   │   │   │   ├── test_global_branch_error_local_to_global_with_split_sbp.py
│   │   │   │   ├── test_global_branch_error_with_global_mean.py
│   │   │   │   ├── test_hann_window.py
│   │   │   │   ├── test_in_top_k.py
│   │   │   │   ├── test_inv.py
│   │   │   │   ├── test_layernorm.py
│   │   │   │   ├── test_linalg.py
│   │   │   │   ├── test_local_global_convert_error.py
│   │   │   │   ├── test_median.py
│   │   │   │   ├── test_mm.py
│   │   │   │   ├── test_mode.py
│   │   │   │   ├── test_multi_input_with_diff_device_or_placement.py
│   │   │   │   ├── test_mv.py
│   │   │   │   ├── test_nn_functor.py
│   │   │   │   ├── test_optim_add_param_group.py
│   │   │   │   ├── test_pad.py
│   │   │   │   ├── test_placement.py
│   │   │   │   ├── test_randperm_op.py
│   │   │   │   ├── test_reduce_like_ops.py
│   │   │   │   ├── test_reduce_ops.py
│   │   │   │   ├── test_repeat_interleave.py
│   │   │   │   ├── test_reshape.py
│   │   │   │   ├── test_reshape_like_op.py
│   │   │   │   ├── test_roi_align_op.py
│   │   │   │   ├── test_save_load.py
│   │   │   │   ├── test_saved_tensor_hooks.py
│   │   │   │   ├── test_slice_op.py
│   │   │   │   ├── test_smooth_l1_loss_op.py
│   │   │   │   ├── test_softmax_cross_entropy_op.py
│   │   │   │   ├── test_sparse_cross_entropy_op.py
│   │   │   │   ├── test_sparse_softmax_cross_entropy_op.py
│   │   │   │   ├── test_split_like_op.py
│   │   │   │   ├── test_stft_op.py
│   │   │   │   ├── test_tensor_index.py
│   │   │   │   ├── test_tensordot.py
│   │   │   │   ├── test_to_global_error.py
│   │   │   │   ├── test_view.py
│   │   │   │   └── throw_error.py
│   │   │   ├── expensive/
│   │   │   │   ├── README.md
│   │   │   │   ├── _internally_replaced_utils.py
│   │   │   │   ├── _test_remat.py
│   │   │   │   ├── pytorch_alexnet.py
│   │   │   │   ├── pytorch_convmixer.py
│   │   │   │   ├── pytorch_convnext.py
│   │   │   │   ├── pytorch_crossformer.py
│   │   │   │   ├── pytorch_densenet.py
│   │   │   │   ├── pytorch_efficientnet.py
│   │   │   │   ├── pytorch_ghostnet.py
│   │   │   │   ├── pytorch_googlenet.py
│   │   │   │   ├── pytorch_inception_v3.py
│   │   │   │   ├── pytorch_levit.py
│   │   │   │   ├── pytorch_mnasnet.py
│   │   │   │   ├── pytorch_poolformer.py
│   │   │   │   ├── pytorch_pvt.py
│   │   │   │   ├── pytorch_res2net.py
│   │   │   │   ├── pytorch_resmlp.py
│   │   │   │   ├── pytorch_resnet.py
│   │   │   │   ├── pytorch_rexnet.py
│   │   │   │   ├── pytorch_rexnetv1_lite.py
│   │   │   │   ├── pytorch_senet.py
│   │   │   │   ├── pytorch_shufflenetv2.py
│   │   │   │   ├── pytorch_squeezenet.py
│   │   │   │   ├── pytorch_swin_transformer.py
│   │   │   │   ├── pytorch_uniformer.py
│   │   │   │   ├── pytroch_mlp_mixer.py
│   │   │   │   ├── resnet50_model.py
│   │   │   │   ├── test_compatibility.py
│   │   │   │   ├── test_conv3d.py
│   │   │   │   ├── test_convtranspose.py
│   │   │   │   ├── test_dynamic_allocation_gradient_shuffle.py
│   │   │   │   ├── test_einsum.py
│   │   │   │   ├── test_global_tensor_offload.py
│   │   │   │   ├── test_graph_multi_graph_v2.py
│   │   │   │   ├── test_id_shuffle.py
│   │   │   │   ├── test_id_shuffle_global.py
│   │   │   │   ├── test_layernorm.py
│   │   │   │   ├── test_oneembedding.py
│   │   │   │   ├── test_oneembedding_padding_idx.py
│   │   │   │   ├── test_permute.py
│   │   │   │   ├── test_remat.py
│   │   │   │   ├── test_resnet50_with_bn.py
│   │   │   │   ├── test_resnet50_without_bn.py
│   │   │   │   ├── test_rnn.py
│   │   │   │   ├── test_rnn_cell.py
│   │   │   │   ├── test_rnn_pack_sequence.py
│   │   │   │   ├── test_rnn_utils.py
│   │   │   │   ├── test_sqrt_square_sum.py
│   │   │   │   ├── test_tensor_offload.py
│   │   │   │   ├── test_tensor_str.py
│   │   │   │   └── test_util.py
│   │   │   ├── gen_ops_process.py
│   │   │   ├── graph/
│   │   │   │   ├── alexnet_model.py
│   │   │   │   ├── ofrecord_data_utils.py
│   │   │   │   ├── optimizer_test_util.py
│   │   │   │   ├── test_alexnet_auto_parallel.py
│   │   │   │   ├── test_alexnet_graph.py
│   │   │   │   ├── test_comb1to2d.py
│   │   │   │   ├── test_comb2d.py
│   │   │   │   ├── test_forward_graph.py
│   │   │   │   ├── test_free_tensor_not_in_job.py
│   │   │   │   ├── test_fx_fuse.py
│   │   │   │   ├── test_fx_replace_ops.py
│   │   │   │   ├── test_fx_symbolic_trace_module.py
│   │   │   │   ├── test_gbc1to2d.py
│   │   │   │   ├── test_gbc2d.py
│   │   │   │   ├── test_gbc2to1d.py
│   │   │   │   ├── test_gbc2to2d.py
│   │   │   │   ├── test_graph.py
│   │   │   │   ├── test_graph_activation_checkpoint.py
│   │   │   │   ├── test_graph_arange.py
│   │   │   │   ├── test_graph_asymmetric_io.py
│   │   │   │   ├── test_graph_block.py
│   │   │   │   ├── test_graph_buffer_limit.py
│   │   │   │   ├── test_graph_clip_grad_norm.py
│   │   │   │   ├── test_graph_copy.py
│   │   │   │   ├── test_graph_debug.py
│   │   │   │   ├── test_graph_depend.py
│   │   │   │   ├── test_graph_eye.py
│   │   │   │   ├── test_graph_free_eager_tensor.py
│   │   │   │   ├── test_graph_grad_acc.py
│   │   │   │   ├── test_graph_image_gpu_decoder.py
│   │   │   │   ├── test_graph_inplace_add.py
│   │   │   │   ├── test_graph_io_check.py
│   │   │   │   ├── test_graph_linear.py
│   │   │   │   ├── test_graph_linear_train.py
│   │   │   │   ├── test_graph_loss.py
│   │   │   │   ├── test_graph_lr_scale.py
│   │   │   │   ├── test_graph_lr_scheduler.py
│   │   │   │   ├── test_graph_lr_with_warmup.py
│   │   │   │   ├── test_graph_lrs.py
│   │   │   │   ├── test_graph_masked_fill.py
│   │   │   │   ├── test_graph_nccl_logical_fusion.py
│   │   │   │   ├── test_graph_non_contiguous_tensors.py
│   │   │   │   ├── test_graph_normal_inplace.py
│   │   │   │   ├── test_graph_ofrecord_reader.py
│   │   │   │   ├── test_graph_optim_adadelta.py
│   │   │   │   ├── test_graph_optim_adagrad.py
│   │   │   │   ├── test_graph_optim_adam.py
│   │   │   │   ├── test_graph_optim_adamw.py
│   │   │   │   ├── test_graph_optim_ftrl.py
│   │   │   │   ├── test_graph_optim_lamb.py
│   │   │   │   ├── test_graph_optim_rmsprop.py
│   │   │   │   ├── test_graph_optim_sgd.py
│   │   │   │   ├── test_graph_optimizer.py
│   │   │   │   ├── test_graph_pipeline.py
│   │   │   │   ├── test_graph_pipeline_delay.py
│   │   │   │   ├── test_graph_random_seed.py
│   │   │   │   ├── test_graph_relu.py
│   │   │   │   ├── test_graph_reshape_acc.py
│   │   │   │   ├── test_graph_reuse_var.py
│   │   │   │   ├── test_graph_save_load.py
│   │   │   │   ├── test_graph_save_load_global_b_s.py
│   │   │   │   ├── test_graph_scalar.py
│   │   │   │   ├── test_graph_separate_compile.py
│   │   │   │   ├── test_graph_session_env_destruct.py
│   │   │   │   ├── test_graph_session_env_destruct1.py
│   │   │   │   ├── test_graph_sparse_optimizer.py
│   │   │   │   ├── test_graph_sparse_softmax_cross_entropy.py
│   │   │   │   ├── test_graph_tensor_clone.py
│   │   │   │   ├── test_graph_tensor_detach.py
│   │   │   │   ├── test_graph_with_global.py
│   │   │   │   ├── test_graph_zero.py
│   │   │   │   ├── test_input_op_expr.py
│   │   │   │   ├── test_long_add_n_pass.py
│   │   │   │   ├── test_modify_module_forward.py
│   │   │   │   ├── test_multi_client_session.py
│   │   │   │   ├── test_multi_graph.py
│   │   │   │   ├── test_multi_tensor_adam_update_with_cast.py
│   │   │   │   ├── test_multi_tensor_sgd_update_with_cast.py
│   │   │   │   ├── test_nccl_logical_send_recv.py
│   │   │   │   ├── test_neq_device_process_num.py
│   │   │   │   ├── test_oneflow_compiler.py
│   │   │   │   ├── test_optimization_conf.py
│   │   │   │   ├── test_output_op_expr.py
│   │   │   │   ├── test_run_global_graph_by_vm.py
│   │   │   │   ├── test_run_graph_by_vm.py
│   │   │   │   ├── test_to_global.py
│   │   │   │   ├── test_tvm_frontend_dependency_on_graph.py
│   │   │   │   ├── test_user_op_expr.py
│   │   │   │   ├── test_util.py
│   │   │   │   └── test_variable_op_expr.py
│   │   │   ├── misc/
│   │   │   │   ├── mock_example.py
│   │   │   │   ├── test_autograd_functional.py
│   │   │   │   ├── test_distributed_env_vars.py
│   │   │   │   ├── test_empty_cache.py
│   │   │   │   ├── test_env_cuda.py
│   │   │   │   ├── test_manual_seed_api.py
│   │   │   │   ├── test_mock_diffusers.py
│   │   │   │   ├── test_mock_scope.py
│   │   │   │   ├── test_np_dtype_converter.py
│   │   │   │   ├── test_placement.py
│   │   │   │   └── test_pybind11_caster.py
│   │   │   ├── modules/
│   │   │   │   ├── image_test_util.py
│   │   │   │   ├── optimizer_test_util.py
│   │   │   │   ├── save_load_test_data/
│   │   │   │   │   ├── 3x3_i3o3_conv2d/
│   │   │   │   │   │   ├── pickled_data
│   │   │   │   │   │   ├── tensor_3/
│   │   │   │   │   │   │   ├── meta
│   │   │   │   │   │   │   └── out
│   │   │   │   │   │   └── tensor_4/
│   │   │   │   │   │       ├── meta
│   │   │   │   │   │       └── out
│   │   │   │   │   └── 3x3_i3o3_conv2d_params/
│   │   │   │   │       ├── pickled_data
│   │   │   │   │       ├── tensor_5/
│   │   │   │   │       │   ├── meta
│   │   │   │   │       │   └── out
│   │   │   │   │       └── tensor_6/
│   │   │   │   │           ├── meta
│   │   │   │   │           └── out
│   │   │   │   ├── sync_batchnorm_test_util.py
│   │   │   │   ├── test_0_dim_tensor.py
│   │   │   │   ├── test_TripletMarginLoss.py
│   │   │   │   ├── test_abs.py
│   │   │   │   ├── test_activation.py
│   │   │   │   ├── test_adaptive_max_pool.py
│   │   │   │   ├── test_adaptive_pool.py
│   │   │   │   ├── test_adaptive_pool_fp16.py
│   │   │   │   ├── test_add.py
│   │   │   │   ├── test_addcdiv.py
│   │   │   │   ├── test_addcmul.py
│   │   │   │   ├── test_addmm.py
│   │   │   │   ├── test_affine_grid.py
│   │   │   │   ├── test_allclose.py
│   │   │   │   ├── test_allreduce.py
│   │   │   │   ├── test_amax.py
│   │   │   │   ├── test_amin.py
│   │   │   │   ├── test_arange.py
│   │   │   │   ├── test_argmax.py
│   │   │   │   ├── test_argmin.py
│   │   │   │   ├── test_argsort.py
│   │   │   │   ├── test_argwhere.py
│   │   │   │   ├── test_as_strided.py
│   │   │   │   ├── test_as_tensor.py
│   │   │   │   ├── test_asyncs_thread.py
│   │   │   │   ├── test_atleast.py
│   │   │   │   ├── test_auto_to_global.py
│   │   │   │   ├── test_autograd.py
│   │   │   │   ├── test_autograd_function.py
│   │   │   │   ├── test_autograd_mode.py
│   │   │   │   ├── test_avgpool.py
│   │   │   │   ├── test_baddbmm.py
│   │   │   │   ├── test_batch_gather.py
│   │   │   │   ├── test_batchnorm.py
│   │   │   │   ├── test_batchnorm_add_relu.py
│   │   │   │   ├── test_bernoulli.py
│   │   │   │   ├── test_binary_math_ops_dtype.py
│   │   │   │   ├── test_bincount.py
│   │   │   │   ├── test_bitwise.py
│   │   │   │   ├── test_bmm.py
│   │   │   │   ├── test_broadcast_like.py
│   │   │   │   ├── test_broadcast_ops.py
│   │   │   │   ├── test_cast.py
│   │   │   │   ├── test_ceil.py
│   │   │   │   ├── test_check_meta_consistency.py
│   │   │   │   ├── test_checkpointing.py
│   │   │   │   ├── test_chunk.py
│   │   │   │   ├── test_clamp.py
│   │   │   │   ├── test_clip_grad.py
│   │   │   │   ├── test_clone.py
│   │   │   │   ├── test_coco_reader.py
│   │   │   │   ├── test_coin_flip.py
│   │   │   │   ├── test_comb2to2d.py
│   │   │   │   ├── test_combined_margin_loss.py
│   │   │   │   ├── test_comm.py
│   │   │   │   ├── test_comm_ops.py
│   │   │   │   ├── test_concat.py
│   │   │   │   ├── test_constant.py
│   │   │   │   ├── test_constant_pad.py
│   │   │   │   ├── test_contiguous.py
│   │   │   │   ├── test_conv1d.py
│   │   │   │   ├── test_conv2d.py
│   │   │   │   ├── test_copy.py
│   │   │   │   ├── test_cosine_similarity.py
│   │   │   │   ├── test_ctc_greedy_decoder.py
│   │   │   │   ├── test_ctc_loss.py
│   │   │   │   ├── test_cublas_fused_mlp.py
│   │   │   │   ├── test_cum_ops.py
│   │   │   │   ├── test_dataset.py
│   │   │   │   ├── test_ddp.py
│   │   │   │   ├── test_ddp_multi_outputs.py
│   │   │   │   ├── test_deconv2d.py
│   │   │   │   ├── test_default_dtype.py
│   │   │   │   ├── test_deform_conv2d.py
│   │   │   │   ├── test_det.py
│   │   │   │   ├── test_diag.py
│   │   │   │   ├── test_diagonal.py
│   │   │   │   ├── test_div.py
│   │   │   │   ├── test_dlpack.py
│   │   │   │   ├── test_dot.py
│   │   │   │   ├── test_dropout.py
│   │   │   │   ├── test_dynamic_allocation_gradient_shuffle_shuffle_global.py
│   │   │   │   ├── test_eager_boxing.py
│   │   │   │   ├── test_eager_boxing_exhaustive.py
│   │   │   │   ├── test_empty.py
│   │   │   │   ├── test_eq.py
│   │   │   │   ├── test_equal.py
│   │   │   │   ├── test_erf.py
│   │   │   │   ├── test_erfc.py
│   │   │   │   ├── test_erfinv.py
│   │   │   │   ├── test_expand.py
│   │   │   │   ├── test_expand_stride.py
│   │   │   │   ├── test_expm1.py
│   │   │   │   ├── test_eye.py
│   │   │   │   ├── test_fake_quantization.py
│   │   │   │   ├── test_fft.py
│   │   │   │   ├── test_flatten.py
│   │   │   │   ├── test_flip.py
│   │   │   │   ├── test_floor.py
│   │   │   │   ├── test_fmod.py
│   │   │   │   ├── test_fold.py
│   │   │   │   ├── test_fork_sub_process.py
│   │   │   │   ├── test_frac.py
│   │   │   │   ├── test_from_numpy.py
│   │   │   │   ├── test_from_torch.py
│   │   │   │   ├── test_functional_docstr.py
│   │   │   │   ├── test_functional_scalar_tensor_param.py
│   │   │   │   ├── test_fused_attention_ops.py
│   │   │   │   ├── test_fused_bias_add_dropout.py
│   │   │   │   ├── test_fused_bias_add_gelu.py
│   │   │   │   ├── test_fused_bias_add_scale_mask_softmax_dropout.py
│   │   │   │   ├── test_fused_center.py
│   │   │   │   ├── test_fused_codegeex_qkv_reshape.py
│   │   │   │   ├── test_fused_cross_interaction.py
│   │   │   │   ├── test_fused_dot_feature_interaction.py
│   │   │   │   ├── test_fused_gelu_mul.py
│   │   │   │   ├── test_fused_get_boundding_boxes_coord.py
│   │   │   │   ├── test_fused_get_ciou_diagonal_angle.py
│   │   │   │   ├── test_fused_get_ciou_result.py
│   │   │   │   ├── test_fused_get_convex_diagonal_squared.py
│   │   │   │   ├── test_fused_get_intersection_area.py
│   │   │   │   ├── test_fused_get_iou.py
│   │   │   │   ├── test_fused_glu.py
│   │   │   │   ├── test_fused_matmul_bias.py
│   │   │   │   ├── test_fused_matmul_bias_add_relu_dropout.py
│   │   │   │   ├── test_fused_rotary_embedding.py
│   │   │   │   ├── test_fused_scale_mask_bias_softmax.py
│   │   │   │   ├── test_fused_scale_mask_softmax.py
│   │   │   │   ├── test_fused_scale_mask_softmax_dropout.py
│   │   │   │   ├── test_fused_scale_tril.py
│   │   │   │   ├── test_fused_self_attention.py
│   │   │   │   ├── test_fused_tril_softmax_mask_scale.py
│   │   │   │   ├── test_fused_weighted_sum.py
│   │   │   │   ├── test_gather.py
│   │   │   │   ├── test_gather_nd.py
│   │   │   │   ├── test_gelu_approximate.py
│   │   │   │   ├── test_generator.py
│   │   │   │   ├── test_global_0_dim_tensor.py
│   │   │   │   ├── test_global_TripletMarginLoss.py
│   │   │   │   ├── test_global_abs.py
│   │   │   │   ├── test_global_activation.py
│   │   │   │   ├── test_global_adaptive_pool.py
│   │   │   │   ├── test_global_add.py
│   │   │   │   ├── test_global_addcdiv.py
│   │   │   │   ├── test_global_addcmul.py
│   │   │   │   ├── test_global_addmm.py
│   │   │   │   ├── test_global_affine_grid.py
│   │   │   │   ├── test_global_argmax.py
│   │   │   │   ├── test_global_argmin.py
│   │   │   │   ├── test_global_argsort.py
│   │   │   │   ├── test_global_argwhere.py
│   │   │   │   ├── test_global_atleast.py
│   │   │   │   ├── test_global_avgpool.py
│   │   │   │   ├── test_global_batch_gather.py
│   │   │   │   ├── test_global_bincount.py
│   │   │   │   ├── test_global_bitwise.py
│   │   │   │   ├── test_global_broadcase_like.py
│   │   │   │   ├── test_global_broadcast_matmul.py
│   │   │   │   ├── test_global_broadcast_ops.py
│   │   │   │   ├── test_global_cast.py
│   │   │   │   ├── test_global_chunk.py
│   │   │   │   ├── test_global_clone.py
│   │   │   │   ├── test_global_coin_flip.py
│   │   │   │   ├── test_global_concat.py
│   │   │   │   ├── test_global_constant.py
│   │   │   │   ├── test_global_ctc_loss.py
│   │   │   │   ├── test_global_cumprod.py
│   │   │   │   ├── test_global_cumsum.py
│   │   │   │   ├── test_global_deconv2d.py
│   │   │   │   ├── test_global_deform_conv2d.py
│   │   │   │   ├── test_global_det.py
│   │   │   │   ├── test_global_diag.py
│   │   │   │   ├── test_global_diagonal.py
│   │   │   │   ├── test_global_div.py
│   │   │   │   ├── test_global_dot.py
│   │   │   │   ├── test_global_dropout.py
│   │   │   │   ├── test_global_einsum_alphaflod_usecase1.py
│   │   │   │   ├── test_global_einsum_alphaflod_usecase10.py
│   │   │   │   ├── test_global_einsum_alphaflod_usecase11.py
│   │   │   │   ├── test_global_einsum_alphaflod_usecase2.py
│   │   │   │   ├── test_global_einsum_alphaflod_usecase3.py
│   │   │   │   ├── test_global_einsum_alphaflod_usecase4.py
│   │   │   │   ├── test_global_einsum_alphaflod_usecase5.py
│   │   │   │   ├── test_global_einsum_alphaflod_usecase6.py
│   │   │   │   ├── test_global_einsum_alphaflod_usecase7.py
│   │   │   │   ├── test_global_einsum_alphaflod_usecase8.py
│   │   │   │   ├── test_global_einsum_alphaflod_usecase9.py
│   │   │   │   ├── test_global_einsum_attention.py
│   │   │   │   ├── test_global_einsum_batch_matmul.py
│   │   │   │   ├── test_global_einsum_batch_matmul2.py
│   │   │   │   ├── test_global_einsum_batch_matmul3.py
│   │   │   │   ├── test_global_einsum_batch_matmul4.py
│   │   │   │   ├── test_global_einsum_batch_matrix_vector_multiply.py
│   │   │   │   ├── test_global_einsum_batch_permute.py
│   │   │   │   ├── test_global_einsum_bilinear_transformation.py
│   │   │   │   ├── test_global_einsum_eltwise_mul_sum_row.py
│   │   │   │   ├── test_global_einsum_eltwise_mul_then_reduce_sum.py
│   │   │   │   ├── test_global_einsum_eltwise_multiply.py
│   │   │   │   ├── test_global_einsum_get_diagonal.py
│   │   │   │   ├── test_global_einsum_matmul.py
│   │   │   │   ├── test_global_einsum_matmul2.py
│   │   │   │   ├── test_global_einsum_matrix_column_sum.py
│   │   │   │   ├── test_global_einsum_matrix_transpose.py
│   │   │   │   ├── test_global_einsum_matrix_vector_multiply.py
│   │   │   │   ├── test_global_einsum_reduce_sum.py
│   │   │   │   ├── test_global_einsum_tensor_contraction.py
│   │   │   │   ├── test_global_einsum_tensor_contraction2.py
│   │   │   │   ├── test_global_einsum_vector_inner_product.py
│   │   │   │   ├── test_global_einsum_vector_outer_product.py
│   │   │   │   ├── test_global_empty.py
│   │   │   │   ├── test_global_eq.py
│   │   │   │   ├── test_global_erf.py
│   │   │   │   ├── test_global_erfc.py
│   │   │   │   ├── test_global_expand_op.py
│   │   │   │   ├── test_global_expm1.py
│   │   │   │   ├── test_global_eye.py
│   │   │   │   ├── test_global_fill.py
│   │   │   │   ├── test_global_flatten.py
│   │   │   │   ├── test_global_flip.py
│   │   │   │   ├── test_global_floor.py
│   │   │   │   ├── test_global_fmod.py
│   │   │   │   ├── test_global_fold.py
│   │   │   │   ├── test_global_frac.py
│   │   │   │   ├── test_global_full.py
│   │   │   │   ├── test_global_full_like.py
│   │   │   │   ├── test_global_greater.py
│   │   │   │   ├── test_global_greater_equal.py
│   │   │   │   ├── test_global_grid_sample.py
│   │   │   │   ├── test_global_groupnorm.py
│   │   │   │   ├── test_global_gru_cell.py
│   │   │   │   ├── test_global_hann_window.py
│   │   │   │   ├── test_global_higher_derivative_activation.py
│   │   │   │   ├── test_global_higher_derivative_conv.py
│   │   │   │   ├── test_global_higher_derivative_div.py
│   │   │   │   ├── test_global_higher_derivative_loss.py
│   │   │   │   ├── test_global_higher_derivative_matmul.py
│   │   │   │   ├── test_global_higher_derivative_neg.py
│   │   │   │   ├── test_global_higher_derivative_pool.py
│   │   │   │   ├── test_global_higher_derivative_pow.py
│   │   │   │   ├── test_global_higher_derivative_scalar_pow.py
│   │   │   │   ├── test_global_higher_derivative_slice.py
│   │   │   │   ├── test_global_higher_derivative_softmax.py
│   │   │   │   ├── test_global_inv.py
│   │   │   │   ├── test_global_lerp.py
│   │   │   │   ├── test_global_linalg_cross.py
│   │   │   │   ├── test_global_linear.py
│   │   │   │   ├── test_global_linspace.py
│   │   │   │   ├── test_global_logspace.py
│   │   │   │   ├── test_global_lstm_cell.py
│   │   │   │   ├── test_global_masked_fill.py
│   │   │   │   ├── test_global_masked_select.py
│   │   │   │   ├── test_global_math_op_higher_derivative.py
│   │   │   │   ├── test_global_math_ops.py
│   │   │   │   ├── test_global_matmul.py
│   │   │   │   ├── test_global_max.py
│   │   │   │   ├── test_global_maximum_minimum.py
│   │   │   │   ├── test_global_maxpool.py
│   │   │   │   ├── test_global_maxunpool.py
│   │   │   │   ├── test_global_mean.py
│   │   │   │   ├── test_global_median.py
│   │   │   │   ├── test_global_meshgrid.py
│   │   │   │   ├── test_global_min.py
│   │   │   │   ├── test_global_min_max_observer.py
│   │   │   │   ├── test_global_movedim.py
│   │   │   │   ├── test_global_moving_average_max_min_observer.py
│   │   │   │   ├── test_global_mul.py
│   │   │   │   ├── test_global_mv.py
│   │   │   │   ├── test_global_nansum.py
│   │   │   │   ├── test_global_narrow.py
│   │   │   │   ├── test_global_ne.py
│   │   │   │   ├── test_global_negative.py
│   │   │   │   ├── test_global_nms.py
│   │   │   │   ├── test_global_normal.py
│   │   │   │   ├── test_global_normalize.py
│   │   │   │   ├── test_global_nozero.py
│   │   │   │   ├── test_global_ones_like.py
│   │   │   │   ├── test_global_pad.py
│   │   │   │   ├── test_global_partical_fc.py
│   │   │   │   ├── test_global_permute.py
│   │   │   │   ├── test_global_rand.py
│   │   │   │   ├── test_global_randint.py
│   │   │   │   ├── test_global_randint_like.py
│   │   │   │   ├── test_global_randn.py
│   │   │   │   ├── test_global_random_op_data.py
│   │   │   │   ├── test_global_randperm.py
│   │   │   │   ├── test_global_reciprocal.py
│   │   │   │   ├── test_global_reflection_pad2d.py
│   │   │   │   ├── test_global_repeat.py
│   │   │   │   ├── test_global_replication_pad2d.py
│   │   │   │   ├── test_global_reshape.py
│   │   │   │   ├── test_global_rnn.py
│   │   │   │   ├── test_global_rnn_cell.py
│   │   │   │   ├── test_global_roi_align.py
│   │   │   │   ├── test_global_roll.py
│   │   │   │   ├── test_global_round.py
│   │   │   │   ├── test_global_scatter_nd.py
│   │   │   │   ├── test_global_scatter_ops.py
│   │   │   │   ├── test_global_searchsorted.py
│   │   │   │   ├── test_global_sign.py
│   │   │   │   ├── test_global_slice.py
│   │   │   │   ├── test_global_slice_update.py
│   │   │   │   ├── test_global_sort.py
│   │   │   │   ├── test_global_sparse.py
│   │   │   │   ├── test_global_sparse_softmax_cross_entropy.py
│   │   │   │   ├── test_global_split.py
│   │   │   │   ├── test_global_sqrt_square_sum.py
│   │   │   │   ├── test_global_squeeze.py
│   │   │   │   ├── test_global_stack.py
│   │   │   │   ├── test_global_stateful_kernel_with_cache.py
│   │   │   │   ├── test_global_std.py
│   │   │   │   ├── test_global_sub.py
│   │   │   │   ├── test_global_sum.py
│   │   │   │   ├── test_global_tensor_new.py
│   │   │   │   ├── test_global_tensor_ops.py
│   │   │   │   ├── test_global_tensor_scatter_nd_update.py
│   │   │   │   ├── test_global_tensordot.py
│   │   │   │   ├── test_global_tile.py
│   │   │   │   ├── test_global_transpose.py
│   │   │   │   ├── test_global_tril.py
│   │   │   │   ├── test_global_triu.py
│   │   │   │   ├── test_global_unbind.py
│   │   │   │   ├── test_global_unfold.py
│   │   │   │   ├── test_global_unfold_tensor.py
│   │   │   │   ├── test_global_unique.py
│   │   │   │   ├── test_global_unsqueeze.py
│   │   │   │   ├── test_global_upsample.py
│   │   │   │   ├── test_global_var.py
│   │   │   │   ├── test_global_vector_matrix_product.py
│   │   │   │   ├── test_global_view.py
│   │   │   │   ├── test_global_weight_norm.py
│   │   │   │   ├── test_global_where.py
│   │   │   │   ├── test_global_zeropad2d.py
│   │   │   │   ├── test_global_zeros_like.py
│   │   │   │   ├── test_glu.py
│   │   │   │   ├── test_gpt_data_loader.py
│   │   │   │   ├── test_greater.py
│   │   │   │   ├── test_greater_equal.py
│   │   │   │   ├── test_grid_sample.py
│   │   │   │   ├── test_grouped_matmul_bias.py
│   │   │   │   ├── test_groupnorm.py
│   │   │   │   ├── test_groupwise_quantization.py
│   │   │   │   ├── test_gumbel_softmax.py
│   │   │   │   ├── test_hann_window.py
│   │   │   │   ├── test_higher_derivative_activation.py
│   │   │   │   ├── test_higher_derivative_conv.py
│   │   │   │   ├── test_higher_derivative_div.py
│   │   │   │   ├── test_higher_derivative_loss.py
│   │   │   │   ├── test_higher_derivative_matmul.py
│   │   │   │   ├── test_higher_derivative_neg.py
│   │   │   │   ├── test_higher_derivative_pool.py
│   │   │   │   ├── test_higher_derivative_pow.py
│   │   │   │   ├── test_higher_derivative_scalar_pow.py
│   │   │   │   ├── test_higher_derivative_slice.py
│   │   │   │   ├── test_higher_derivative_softmax.py
│   │   │   │   ├── test_host_memory_input.py
│   │   │   │   ├── test_hsplit.py
│   │   │   │   ├── test_hub.py
│   │   │   │   ├── test_image_batch_align.py
│   │   │   │   ├── test_image_decode.py
│   │   │   │   ├── test_image_flip.py
│   │   │   │   ├── test_image_normalize.py
│   │   │   │   ├── test_image_resize.py
│   │   │   │   ├── test_in_top_k.py
│   │   │   │   ├── test_index_add.py
│   │   │   │   ├── test_index_select.py
│   │   │   │   ├── test_info.py
│   │   │   │   ├── test_initializer.py
│   │   │   │   ├── test_instancenorm.py
│   │   │   │   ├── test_interpolate.py
│   │   │   │   ├── test_inv.py
│   │   │   │   ├── test_isclose.py
│   │   │   │   ├── test_jit_script_api.py
│   │   │   │   ├── test_layer_norm.py
│   │   │   │   ├── test_lerp.py
│   │   │   │   ├── test_less.py
│   │   │   │   ├── test_less_equal.py
│   │   │   │   ├── test_linalg_cross.py
│   │   │   │   ├── test_linear.py
│   │   │   │   ├── test_linspace.py
│   │   │   │   ├── test_log1p.py
│   │   │   │   ├── test_logaddexp.py
│   │   │   │   ├── test_logical_and.py
│   │   │   │   ├── test_logical_not.py
│   │   │   │   ├── test_logical_or.py
│   │   │   │   ├── test_logical_reduce.py
│   │   │   │   ├── test_logical_xor.py
│   │   │   │   ├── test_logspace.py
│   │   │   │   ├── test_logsumexp.py
│   │   │   │   ├── test_loss.py
│   │   │   │   ├── test_loss_global.py
│   │   │   │   ├── test_lr_scheduler.py
│   │   │   │   ├── test_masked_fill.py
│   │   │   │   ├── test_masked_select.py
│   │   │   │   ├── test_math_op_higher_derivative.py
│   │   │   │   ├── test_math_ops.py
│   │   │   │   ├── test_matmul.py
│   │   │   │   ├── test_max.py
│   │   │   │   ├── test_maxpool.py
│   │   │   │   ├── test_maxunpool.py
│   │   │   │   ├── test_mean.py
│   │   │   │   ├── test_median.py
│   │   │   │   ├── test_meshgrid.py
│   │   │   │   ├── test_min.py
│   │   │   │   ├── test_min_max_observer.py
│   │   │   │   ├── test_mock.py
│   │   │   │   ├── test_mode.py
│   │   │   │   ├── test_module.py
│   │   │   │   ├── test_module_to.py
│   │   │   │   ├── test_module_to_global_or_local.py
│   │   │   │   ├── test_module_to_half.py
│   │   │   │   ├── test_movedim.py
│   │   │   │   ├── test_moving_average_min_max_observer.py
│   │   │   │   ├── test_mul.py
│   │   │   │   ├── test_multi_tensor_yolov5_weight_update.py
│   │   │   │   ├── test_multinomial.py
│   │   │   │   ├── test_nansum.py
│   │   │   │   ├── test_narrow.py
│   │   │   │   ├── test_ne.py
│   │   │   │   ├── test_negative.py
│   │   │   │   ├── test_nll_loss.py
│   │   │   │   ├── test_nms.py
│   │   │   │   ├── test_noncontiguous_binary_op.py
│   │   │   │   ├── test_nonzero.py
│   │   │   │   ├── test_norm.py
│   │   │   │   ├── test_normalize.py
│   │   │   │   ├── test_ofrecord_reader.py
│   │   │   │   ├── test_one_embedding_adagrad.py
│   │   │   │   ├── test_one_embedding_adam.py
│   │   │   │   ├── test_one_embedding_ftrl.py
│   │   │   │   ├── test_one_embedding_sgd.py
│   │   │   │   ├── test_one_hot.py
│   │   │   │   ├── test_ones_like.py
│   │   │   │   ├── test_optim_adadelta.py
│   │   │   │   ├── test_optim_adagrad.py
│   │   │   │   ├── test_optim_adam.py
│   │   │   │   ├── test_optim_adamw.py
│   │   │   │   ├── test_optim_add_param_group.py
│   │   │   │   ├── test_optim_ftrl.py
│   │   │   │   ├── test_optim_lamb.py
│   │   │   │   ├── test_optim_lbfgs.py
│   │   │   │   ├── test_optim_rmsprop.py
│   │   │   │   ├── test_optim_sgd.py
│   │   │   │   ├── test_pairwise_distance.py
│   │   │   │   ├── test_param_group.py
│   │   │   │   ├── test_parameters_grouping.py
│   │   │   │   ├── test_parital_fc.py
│   │   │   │   ├── test_pixel_shuffle.py
│   │   │   │   ├── test_prelu.py
│   │   │   │   ├── test_prod.py
│   │   │   │   ├── test_pruning.py
│   │   │   │   ├── test_qat_conv_modules.py
│   │   │   │   ├── test_quantile.py
│   │   │   │   ├── test_quantization.py
│   │   │   │   ├── test_quick_gelu.py
│   │   │   │   ├── test_rand.py
│   │   │   │   ├── test_randint.py
│   │   │   │   ├── test_randint_like.py
│   │   │   │   ├── test_randn.py
│   │   │   │   ├── test_randn_like.py
│   │   │   │   ├── test_random_generator_and_seed.py
│   │   │   │   ├── test_randperm.py
│   │   │   │   ├── test_reciprocal.py
│   │   │   │   ├── test_reduce.py
│   │   │   │   ├── test_reduce_sum_like.py
│   │   │   │   ├── test_reflection_pad.py
│   │   │   │   ├── test_repeat.py
│   │   │   │   ├── test_repeat_interleave.py
│   │   │   │   ├── test_replication_pad.py
│   │   │   │   ├── test_reshape.py
│   │   │   │   ├── test_reshape_sbp.py
│   │   │   │   ├── test_resnet_load_torch_weight_compatibile.py
│   │   │   │   ├── test_rmsnorm.py
│   │   │   │   ├── test_roc_auc_score.py
│   │   │   │   ├── test_roi_align.py
│   │   │   │   ├── test_roll.py
│   │   │   │   ├── test_round.py
│   │   │   │   ├── test_rrelu.py
│   │   │   │   ├── test_save_load.py
│   │   │   │   ├── test_saved_tensor_hooks.py
│   │   │   │   ├── test_sbp_symbol.py
│   │   │   │   ├── test_scatter_nd.py
│   │   │   │   ├── test_scatter_ops.py
│   │   │   │   ├── test_searchsorted.py
│   │   │   │   ├── test_select.py
│   │   │   │   ├── test_shutting_down.py
│   │   │   │   ├── test_sign.py
│   │   │   │   ├── test_single_threaded_vm.py
│   │   │   │   ├── test_skip_layer_norm.py
│   │   │   │   ├── test_skip_rms_norm.py
│   │   │   │   ├── test_slice.py
│   │   │   │   ├── test_softmax.py
│   │   │   │   ├── test_softplus.py
│   │   │   │   ├── test_sort.py
│   │   │   │   ├── test_sparse.py
│   │   │   │   ├── test_sparse_softmax_cross_entropy.py
│   │   │   │   ├── test_special_ops.py
│   │   │   │   ├── test_split.py
│   │   │   │   ├── test_square_relu.py
│   │   │   │   ├── test_squeeze.py
│   │   │   │   ├── test_stack.py
│   │   │   │   ├── test_stateful_kernel_with_cache.py
│   │   │   │   ├── test_stateful_local_opkernel.py
│   │   │   │   ├── test_std.py
│   │   │   │   ├── test_stft.py
│   │   │   │   ├── test_sub.py
│   │   │   │   ├── test_sum.py
│   │   │   │   ├── test_swapaxes.py
│   │   │   │   ├── test_swapdims.py
│   │   │   │   ├── test_swautils.py
│   │   │   │   ├── test_sync_and_async_allreduce.py
│   │   │   │   ├── test_sync_batchnorm.py
│   │   │   │   ├── test_t.py
│   │   │   │   ├── test_t5_layernorm.py
│   │   │   │   ├── test_tensor_buffer.py
│   │   │   │   ├── test_tensor_ops.py
│   │   │   │   ├── test_tensor_scatter_nd_update.py
│   │   │   │   ├── test_tensor_split.py
│   │   │   │   ├── test_tensor_to.py
│   │   │   │   ├── test_tensordot.py
│   │   │   │   ├── test_tile.py
│   │   │   │   ├── test_to_torch.py
│   │   │   │   ├── test_topk.py
│   │   │   │   ├── test_transpose.py
│   │   │   │   ├── test_tril.py
│   │   │   │   ├── test_triu.py
│   │   │   │   ├── test_trunc.py
│   │   │   │   ├── test_trunc_divide.py
│   │   │   │   ├── test_type_tensor.py
│   │   │   │   ├── test_unbind.py
│   │   │   │   ├── test_unfold.py
│   │   │   │   ├── test_unfold_tensor.py
│   │   │   │   ├── test_unique.py
│   │   │   │   ├── test_unsqueeze.py
│   │   │   │   ├── test_upsample.py
│   │   │   │   ├── test_util_ops.py
│   │   │   │   ├── test_utils.py
│   │   │   │   ├── test_var.py
│   │   │   │   ├── test_view.py
│   │   │   │   ├── test_vsplit.py
│   │   │   │   ├── test_weight_norm.py
│   │   │   │   ├── test_where.py
│   │   │   │   └── test_zeropad2d.py
│   │   │   ├── profiler/
│   │   │   │   ├── test_events.py
│   │   │   │   └── test_profile_lenet.py
│   │   │   └── tensor/
│   │   │       ├── test_autocast.py
│   │   │       ├── test_bfloat16_activation.py
│   │   │       ├── test_complex.py
│   │   │       ├── test_data_ptr.py
│   │   │       ├── test_global_tensor.py
│   │   │       ├── test_global_tensor_and_ndarray_compatibility.py
│   │   │       ├── test_global_tensor_indexing.py
│   │   │       ├── test_lazy_tensor_indexing.py
│   │   │       ├── test_meta_tensor.py
│   │   │       ├── test_new_tensor.py
│   │   │       ├── test_parameter.py
│   │   │       ├── test_safetensors.py
│   │   │       ├── test_tensor_and_ndarray_compatibility.py
│   │   │       ├── test_tensor_exponential.py
│   │   │       ├── test_tensor_indexing.py
│   │   │       ├── test_tensor_indexing2.py
│   │   │       ├── test_tensor_is_view.py
│   │   │       ├── test_tensor_part_1.py
│   │   │       ├── test_tensor_part_2.py
│   │   │       ├── test_tensor_part_3.py
│   │   │       ├── test_tensor_pin_memory.py
│   │   │       └── test_tensor_to_memory_format.py
│   │   ├── test_utils/
│   │   │   ├── __init__.py
│   │   │   ├── automated_test_util/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── generators.py
│   │   │   │   ├── global_scope.py
│   │   │   │   ├── profiler.py
│   │   │   │   ├── torch_flow_dual_object.py
│   │   │   │   └── util.py
│   │   │   ├── oneflow_pytorch_compatibility/
│   │   │   │   ├── __init__.py
│   │   │   │   └── oneflow_pytorch_compatiblity_test.py
│   │   │   ├── test_util.py
│   │   │   └── throttle.py
│   │   ├── unittest/
│   │   │   ├── __init__.py
│   │   │   ├── dataset.py
│   │   │   ├── env.py
│   │   │   └── mlir.py
│   │   └── utils/
│   │       ├── __init__.py
│   │       ├── checkpoint.py
│   │       ├── data/
│   │       │   ├── __init__.py
│   │       │   ├── _utils/
│   │       │   │   ├── __init__.py
│   │       │   │   ├── collate.py
│   │       │   │   ├── fetch.py
│   │       │   │   ├── pin_memory.py
│   │       │   │   ├── signal_handling.py
│   │       │   │   └── worker.py
│   │       │   ├── dataloader.py
│   │       │   ├── dataset.py
│   │       │   ├── decorator.py
│   │       │   ├── distributed.py
│   │       │   └── sampler.py
│   │       ├── global_view/
│   │       │   ├── __init__.py
│   │       │   ├── global_mode.py
│   │       │   ├── global_utils.py
│   │       │   ├── to_global.py
│   │       │   └── to_local.py
│   │       ├── hooks.py
│   │       ├── insight/
│   │       │   ├── README.md
│   │       │   ├── requirements.txt
│   │       │   └── sqlite_to_google_trace_event.py
│   │       ├── model_zoo.py
│   │       └── tensor/
│   │           ├── __init__.py
│   │           └── from_or_to_torch_tensor.py
│   └── setup.py
└── tools/
    ├── check_src.py
    ├── clean_generated_api.py
    ├── create_pip_index.py
    ├── flags_from_git_diff.py
    ├── functional/
    │   ├── generate_dispatch_stateful_ops.py
    │   ├── generate_functional_api.py
    │   ├── generate_tensor_api.py
    │   └── generator.py
    ├── generate_header_list.py
    ├── generate_pip_version.py
    ├── oneflow-tblgen/
    │   ├── CMakeLists.txt
    │   ├── backends.h
    │   ├── example/
    │   │   └── constant.td
    │   ├── op_schema_emitter.cpp
    │   ├── op_schema_header.inc
    │   ├── op_schema_source.inc
    │   ├── op_schema_types.inc
    │   └── tablegen.cpp
    ├── oss_file_exist.py
    └── package_mirror.py